diff --git a/cpp/include/cuopt/linear_programming/utilities/callbacks_implems.hpp b/cpp/include/cuopt/linear_programming/utilities/callbacks_implems.hpp
index e13fda2ba..ef0fe35b9 100644
--- a/cpp/include/cuopt/linear_programming/utilities/callbacks_implems.hpp
+++ b/cpp/include/cuopt/linear_programming/utilities/callbacks_implems.hpp
@@ -32,11 +32,12 @@ class default_get_solution_callback_t : public get_solution_callback_t {
                     void* solution_bound,
                     void* user_data) override
   {
-    PyObject* numpy_matrix = get_numpy_array(data, n_variables);
-    PyObject* numpy_array  = get_numpy_array(objective_value, 1);
-    PyObject* numpy_bound  = get_numpy_array(solution_bound, 1);
-    PyObject* py_user_data = user_data == nullptr ? Py_None : static_cast<PyObject*>(user_data);
-    PyObject* res          = PyObject_CallMethod(this->pyCallbackClass,
+    PyGILState_STATE gstate = PyGILState_Ensure();
+    PyObject* numpy_matrix  = get_numpy_array(data, n_variables);
+    PyObject* numpy_array   = get_numpy_array(objective_value, 1);
+    PyObject* numpy_bound   = get_numpy_array(solution_bound, 1);
+    PyObject* py_user_data  = user_data == nullptr ? Py_None : static_cast<PyObject*>(user_data);
+    PyObject* res           = PyObject_CallMethod(this->pyCallbackClass,
                                         "get_solution",
                                         "(OOOO)",
                                         numpy_matrix,
@@ -47,6 +48,7 @@ class default_get_solution_callback_t : public get_solution_callback_t {
     Py_DECREF(numpy_array);
     Py_DECREF(numpy_bound);
     if (res != nullptr) { Py_DECREF(res); }
+    PyGILState_Release(gstate);
   }
 
   PyObject* pyCallbackClass;
@@ -69,11 +71,12 @@ class default_set_solution_callback_t : public set_solution_callback_t {
                     void* solution_bound,
                     void* user_data) override
   {
-    PyObject* numpy_matrix = get_numpy_array(data, n_variables);
-    PyObject* numpy_array  = get_numpy_array(objective_value, 1);
-    PyObject* numpy_bound  = get_numpy_array(solution_bound, 1);
-    PyObject* py_user_data = user_data == nullptr ? Py_None : static_cast<PyObject*>(user_data);
-    PyObject* res          = PyObject_CallMethod(this->pyCallbackClass,
+    PyGILState_STATE gstate = PyGILState_Ensure();
+    PyObject* numpy_matrix  = get_numpy_array(data, n_variables);
+    PyObject* numpy_array   = get_numpy_array(objective_value, 1);
+    PyObject* numpy_bound   = get_numpy_array(solution_bound, 1);
+    PyObject* py_user_data  = user_data == nullptr ? Py_None : static_cast<PyObject*>(user_data);
+    PyObject* res           = PyObject_CallMethod(this->pyCallbackClass,
                                         "set_solution",
                                         "(OOOO)",
                                         numpy_matrix,
@@ -84,6 +87,7 @@ class default_set_solution_callback_t : public set_solution_callback_t {
     Py_DECREF(numpy_array);
     Py_DECREF(numpy_bound);
     if (res != nullptr) { Py_DECREF(res); }
+    PyGILState_Release(gstate);
   }
 
   PyObject* pyCallbackClass;
diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 6ce9a4f4d..a4e43a1ec 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -898,7 +898,7 @@ struct nondeterministic_policy_t : tree_update_policy_t<i_t, f_t> {
   {
   }
 
-  f_t upper_bound() const override { return bnb.upper_bound_.load(); }
+  f_t upper_bound() const override { return bnb.get_cutoff(); }
 
   void update_pseudo_costs(mip_node_t<i_t, f_t>* node, f_t leaf_obj) override
   {
@@ -1316,10 +1316,11 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
 
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.set_log(false);
+  f_t cutoff = get_cutoff();
   if (original_lp_.objective_is_integral) {
-    lp_settings.cut_off = std::ceil(upper_bound_ - settings_.integer_tol) + settings_.dual_tol;
+    lp_settings.cut_off = std::ceil(cutoff - settings_.integer_tol) + settings_.dual_tol;
   } else {
-    lp_settings.cut_off = upper_bound_ + settings_.dual_tol;
+    lp_settings.cut_off = cutoff + settings_.dual_tol;
   }
   lp_settings.inside_mip    = 2;
   lp_settings.time_limit    = settings_.time_limit - toc(exploration_stats_.start_time);
@@ -1426,7 +1427,7 @@ void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f_
     // - The lower bound of the parent is lower or equal to its children
     worker->lower_bound = lower_bound;
 
-    if (lower_bound > upper_bound) {
+    if (lower_bound > get_cutoff()) {
       search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
       search_tree_.update(node_ptr, node_status_t::FATHOMED);
       worker->recompute_basis  = true;
@@ -1536,7 +1537,7 @@ void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>
     f_t rel_gap         = user_relative_gap(original_lp_, upper_bound, lower_bound);
     worker->lower_bound = lower_bound;
 
-    if (node_ptr->lower_bound > upper_bound) {
+    if (node_ptr->lower_bound > get_cutoff()) {
       worker->recompute_basis  = true;
       worker->recompute_bounds = true;
       continue;
@@ -1675,7 +1676,7 @@ void branch_and_bound_t<i_t, f_t>::run_scheduler()
         std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
 
         if (!start_node.has_value()) { continue; }
-        if (upper_bound_ < start_node.value()->lower_bound) {
+        if (get_cutoff() < start_node.value()->lower_bound) {
           // This node was put on the heap earlier but its lower bound is now greater than the
           // current upper bound
           search_tree_.graphviz_node(
@@ -1699,7 +1700,7 @@ void branch_and_bound_t<i_t, f_t>::run_scheduler()
         std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_diving();
 
         if (!start_node.has_value()) { continue; }
-        if (upper_bound_ < start_node.value()->lower_bound ||
+        if (get_cutoff() < start_node.value()->lower_bound ||
             start_node.value()->depth < diving_settings.min_node_depth) {
           continue;
         }
@@ -1767,7 +1768,7 @@ void branch_and_bound_t<i_t, f_t>::single_threaded_solve()
     std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
 
     if (!start_node.has_value()) { continue; }
-    if (upper_bound_ < start_node.value()->lower_bound) {
+    if (get_cutoff() < start_node.value()->lower_bound) {
       // This node was put on the heap earlier but its lower bound is now greater than the
       // current upper bound
       search_tree_.graphviz_node(
@@ -2189,12 +2190,12 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
         return mip_status_t::NUMERICAL;
       }
 
-      if (settings_.reduced_cost_strengthening >= 1 && upper_bound_.load() < last_upper_bound) {
+      if (settings_.reduced_cost_strengthening >= 1 && get_cutoff() < last_upper_bound) {
         mutex_upper_.lock();
-        last_upper_bound = upper_bound_.load();
+        last_upper_bound = get_cutoff();
         std::vector<f_t> lower_bounds;
         std::vector<f_t> upper_bounds;
-        find_reduced_cost_fixings(upper_bound_.load(), lower_bounds, upper_bounds);
+        find_reduced_cost_fixings(get_cutoff(), lower_bounds, upper_bounds);
         mutex_upper_.unlock();
         mutex_original_lp_.lock();
         original_lp_.lower = lower_bounds;
@@ -2372,10 +2373,10 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     return solver_status_;
   }
 
-  if (settings_.reduced_cost_strengthening >= 2 && upper_bound_.load() < last_upper_bound) {
+  if (settings_.reduced_cost_strengthening >= 2 && get_cutoff() < last_upper_bound) {
     std::vector<f_t> lower_bounds;
     std::vector<f_t> upper_bounds;
-    i_t num_fixed = find_reduced_cost_fixings(upper_bound_.load(), lower_bounds, upper_bounds);
+    i_t num_fixed = find_reduced_cost_fixings(get_cutoff(), lower_bounds, upper_bounds);
     if (num_fixed > 0) {
       std::vector<bool> bounds_changed(original_lp_.num_cols, true);
       std::vector<char> row_sense;
@@ -2479,7 +2480,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
         std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
 
         if (!start_node.has_value()) { continue; }
-        if (upper_bound_ < start_node.value()->lower_bound) {
+        if (get_cutoff() < start_node.value()->lower_bound) {
           // This node was put on the heap earlier but its lower bound is now greater than the
           // current upper bound
           search_tree_.graphviz_node(
@@ -3321,7 +3322,7 @@ void branch_and_bound_t<i_t, f_t>::deterministic_sort_replay_events(
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::deterministic_prune_worker_nodes_vs_incumbent()
 {
-  f_t upper_bound = upper_bound_.load();
+  f_t upper_bound = get_cutoff();
 
   for (auto& worker : *deterministic_workers_) {
     // Check nodes in plunge stack - filter in place
@@ -3457,14 +3458,14 @@ void branch_and_bound_t<i_t, f_t>::deterministic_populate_diving_heap()
   const int num_diving                  = deterministic_diving_workers_->size();
   constexpr int target_nodes_per_worker = 10;
   const int target_total                = num_diving * target_nodes_per_worker;
-  f_t upper_bound                       = upper_bound_.load();
+  f_t cutoff                            = get_cutoff();
 
   // Collect candidate nodes from BFS worker backlog heaps
   std::vector<std::pair<mip_node_t<i_t, f_t>*, f_t>> candidates;
 
   for (auto& worker : *deterministic_workers_) {
     for (auto* node : worker.backlog.data()) {
-      if (node->lower_bound < upper_bound) {
+      if (node->lower_bound < cutoff) {
         f_t score = node->objective_estimate;
         if (score >= inf) { score = node->lower_bound; }
         candidates.push_back({node, score});
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index a13d5cedc..fac413024 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -108,6 +108,14 @@ class branch_and_bound_t {
 
   bool stop_for_time_limit(mip_solution_t<i_t, f_t>& solution);
 
+  // Set a cutoff bound from an external source (e.g., early FJ during presolve).
+  // Used for node pruning and reduced cost strengthening but NOT for gap computation.
+  // Unlike upper_bound_, this does not imply a verified incumbent solution exists.
+  void set_initial_cutoff(f_t bound) { initial_cutoff_ = bound; }
+
+  // Effective cutoff for node pruning: min of verified incumbent and external cutoff.
+  f_t get_cutoff() const { return std::min(upper_bound_.load(), initial_cutoff_); }
+
   // Repair a low-quality solution from the heuristics.
   bool repair_solution(const std::vector<f_t>& leaf_edge_norms,
                        const std::vector<f_t>& potential_solution,
@@ -169,9 +177,12 @@ class branch_and_bound_t {
   // Mutex for upper bound
   omp_mutex_t mutex_upper_;
 
-  // Global variable for upper bound
+  // Verified incumbent bound (only set when B&B has an actual integer-feasible solution).
   omp_atomic_t<f_t> upper_bound_;
 
+  // External cutoff from early heuristics (for pruning only, no verified solution).
+  f_t initial_cutoff_{std::numeric_limits<f_t>::infinity()};
+
   // Global variable for incumbent. The incumbent should be updated with the upper bound
   mip_solution_t<i_t, f_t> incumbent_;
 
diff --git a/cpp/src/mip_heuristics/CMakeLists.txt b/cpp/src/mip_heuristics/CMakeLists.txt
index 538e3c49a..7c1035fd1 100644
--- a/cpp/src/mip_heuristics/CMakeLists.txt
+++ b/cpp/src/mip_heuristics/CMakeLists.txt
@@ -40,7 +40,9 @@ set(MIP_NON_LP_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/presolve/trivial_presolve.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump_kernels.cu
-  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu)
+  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/early_cpufj.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/early_gpufj.cu)
 
 # Choose which files to include based on build mode
 if(BUILD_LP_ONLY)
diff --git a/cpp/src/mip_heuristics/diversity/population.cu b/cpp/src/mip_heuristics/diversity/population.cu
index c2138c91c..ba23d3fb0 100644
--- a/cpp/src/mip_heuristics/diversity/population.cu
+++ b/cpp/src/mip_heuristics/diversity/population.cu
@@ -267,9 +267,9 @@ void population_t<i_t, f_t>::invoke_get_solution_callback(
   f_t user_bound     = context.stats.get_solution_bound();
   solution_t<i_t, f_t> temp_sol(sol);
   problem_ptr->post_process_assignment(temp_sol.assignment);
-  if (context.settings.mip_scaling) {
+  if (context.settings.mip_scaling && context.scaling != nullptr) {
     rmm::device_uvector<f_t> dummy(0, temp_sol.handle_ptr->get_stream());
-    context.scaling.unscale_solutions(temp_sol.assignment, dummy);
+    context.scaling->unscale_solutions(temp_sol.assignment, dummy);
   }
   if (problem_ptr->has_papilo_presolve_data()) {
     problem_ptr->papilo_uncrush_assignment(temp_sol.assignment);
@@ -346,7 +346,9 @@ void population_t<i_t, f_t>::run_solution_callbacks(solution_t<i_t, f_t>& sol)
                  incumbent_assignment.size(),
                  sol.handle_ptr->get_stream());
 
-      if (context.settings.mip_scaling) { context.scaling.scale_solutions(incumbent_assignment); }
+      if (context.settings.mip_scaling && context.scaling != nullptr) {
+        context.scaling->scale_solutions(incumbent_assignment);
+      }
       bool is_valid = problem_ptr->pre_process_assignment(incumbent_assignment);
       if (!is_valid) { return; }
       cuopt_assert(outside_sol.assignment.size() == incumbent_assignment.size(),
diff --git a/cpp/src/mip_heuristics/early_heuristic.cuh b/cpp/src/mip_heuristics/early_heuristic.cuh
new file mode 100644
index 000000000..333fde195
--- /dev/null
+++ b/cpp/src/mip_heuristics/early_heuristic.cuh
@@ -0,0 +1,109 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <mip_heuristics/problem/problem.cuh>
+#include <mip_heuristics/solution/solution.cuh>
+
+#include <cuopt/linear_programming/mip/solver_settings.hpp>
+
+#include <utilities/logger.hpp>
+
+#include <thrust/fill.h>
+
+#include <chrono>
+#include <functional>
+#include <limits>
+#include <vector>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename f_t>
+using early_incumbent_callback_t =
+  std::function<void(f_t solver_obj, f_t user_obj, const std::vector<f_t>& assignment)>;
+
+// CRTP base for early heuristics that run on the original (or papilo-presolved) problem
+// during presolve to find incumbents as early as possible.
+// Derived classes implement start() and stop().
+template <typename i_t, typename f_t, typename Derived>
+class early_heuristic_t {
+ public:
+  early_heuristic_t(const optimization_problem_t<i_t, f_t>& op_problem,
+                    const typename mip_solver_settings_t<i_t, f_t>::tolerances_t& tolerances,
+                    early_incumbent_callback_t<f_t> incumbent_callback)
+    : incumbent_callback_(std::move(incumbent_callback))
+  {
+    RAFT_CUDA_TRY(cudaGetDevice(&device_id_));
+
+    // Build and preprocess on the original handle, then copy onto our own handle
+    // so the derived solver can run on a dedicated stream (prevents graph capture conflicts).
+    problem_t<i_t, f_t> temp_problem(op_problem, tolerances, false);
+    temp_problem.preprocess_problem();
+    temp_problem.handle_ptr->sync_stream();
+    problem_ptr_ = std::make_unique<problem_t<i_t, f_t>>(temp_problem, &handle_);
+
+    solution_ptr_ = std::make_unique<solution_t<i_t, f_t>>(*problem_ptr_);
+    thrust::fill(handle_.get_thrust_policy(),
+                 solution_ptr_->assignment.begin(),
+                 solution_ptr_->assignment.end(),
+                 f_t{0});
+    solution_ptr_->clamp_within_bounds();
+  }
+
+  bool solution_found() const { return solution_found_; }
+  f_t get_best_objective() const { return best_objective_; }
+  void set_best_objective(f_t obj) { best_objective_ = obj; }
+  const std::vector<f_t>& get_best_assignment() const { return best_assignment_; }
+
+ protected:
+  ~early_heuristic_t() = default;
+
+  // NOT thread-safe. solver_obj is in solver-space (always minimization).
+  // Uses a private CUDA stream to avoid racing with the FJ solver's stream.
+  void try_update_best(f_t solver_obj, const std::vector<f_t>& assignment)
+  {
+    if (solver_obj >= best_objective_) { return; }
+    best_objective_ = solver_obj;
+
+    RAFT_CUDA_TRY(cudaSetDevice(device_id_));
+    auto stream = handle_.get_stream();
+    rmm::device_uvector<f_t> d_assignment(assignment.size(), stream);
+    raft::copy(d_assignment.data(), assignment.data(), assignment.size(), stream);
+    problem_ptr_->post_process_assignment(d_assignment, true, stream);
+    auto user_assignment = cuopt::host_copy(d_assignment, stream);
+
+    best_assignment_ = user_assignment;
+    solution_found_  = true;
+    f_t user_obj     = problem_ptr_->get_user_obj_from_solver_obj(solver_obj);
+    double elapsed =
+      std::chrono::duration<double>(std::chrono::steady_clock::now() - start_time_).count();
+    CUOPT_LOG_INFO("Early heuristics (%s) lowered the primal bound. Objective %g. Time %.2f",
+                   Derived::name(),
+                   user_obj,
+                   elapsed);
+    if (incumbent_callback_) { incumbent_callback_(solver_obj, user_obj, user_assignment); }
+  }
+
+  int device_id_{0};
+
+  // handle_ must be declared before problem_ptr_/solution_ptr_ so it outlives them
+  // (C++ destroys members in reverse declaration order)
+  raft::handle_t handle_;
+
+  std::unique_ptr<problem_t<i_t, f_t>> problem_ptr_;
+  std::unique_ptr<solution_t<i_t, f_t>> solution_ptr_;
+
+  bool solution_found_{false};
+  f_t best_objective_{std::numeric_limits<f_t>::infinity()};
+  std::vector<f_t> best_assignment_;
+
+  early_incumbent_callback_t<f_t> incumbent_callback_;
+  std::chrono::steady_clock::time_point start_time_;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
new file mode 100644
index 000000000..8109653e6
--- /dev/null
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
@@ -0,0 +1,79 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#include "early_cpufj.cuh"
+
+#include <mip_heuristics/feasibility_jump/fj_cpu.cuh>
+#include <mip_heuristics/mip_constants.hpp>
+#include <utilities/logger.hpp>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+early_cpufj_t<i_t, f_t>::early_cpufj_t(
+  const optimization_problem_t<i_t, f_t>& op_problem,
+  const typename mip_solver_settings_t<i_t, f_t>::tolerances_t& tolerances,
+  early_incumbent_callback_t<f_t> incumbent_callback)
+  : early_heuristic_t<i_t, f_t, early_cpufj_t<i_t, f_t>>(
+      op_problem, tolerances, std::move(incumbent_callback))
+{
+}
+
+template <typename i_t, typename f_t>
+early_cpufj_t<i_t, f_t>::~early_cpufj_t()
+{
+  stop();
+}
+
+template <typename i_t, typename f_t>
+void early_cpufj_t<i_t, f_t>::start()
+{
+  if (cpu_fj_thread_) { return; }
+
+  this->preemption_flag_.store(false);
+  this->start_time_ = std::chrono::steady_clock::now();
+
+  cpu_fj_thread_ = std::make_unique<cpu_fj_thread_t<i_t, f_t>>();
+  cpu_fj_thread_->fj_cpu =
+    init_fj_cpu_standalone(*this->problem_ptr_, *this->solution_ptr_, preemption_flag_);
+  cpu_fj_thread_->time_limit = std::numeric_limits<f_t>::infinity();
+
+  cpu_fj_thread_->fj_cpu->log_prefix = "[Early CPUFJ] ";
+
+  cpu_fj_thread_->fj_cpu->improvement_callback =
+    [this](f_t solver_obj, const std::vector<f_t>& assignment, double) {
+      this->try_update_best(solver_obj, assignment);
+    };
+
+  cpu_fj_thread_->start_cpu_solver();
+}
+
+template <typename i_t, typename f_t>
+void early_cpufj_t<i_t, f_t>::stop()
+{
+  if (!cpu_fj_thread_) { return; }
+
+  preemption_flag_.store(true);
+  cpu_fj_thread_->stop_cpu_solver();
+  cpu_fj_thread_->wait_for_cpu_solver();
+
+  CUOPT_LOG_DEBUG("[Early CPUFJ] Stopped after %d iterations, solution_found=%d",
+                  cpu_fj_thread_->fj_cpu ? cpu_fj_thread_->fj_cpu->iterations : 0,
+                  this->solution_found_);
+
+  cpu_fj_thread_.reset();
+}
+
+#if MIP_INSTANTIATE_FLOAT
+template class early_cpufj_t<int, float>;
+#endif
+
+#if MIP_INSTANTIATE_DOUBLE
+template class early_cpufj_t<int, double>;
+#endif
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh
new file mode 100644
index 000000000..911e84655
--- /dev/null
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cuh
@@ -0,0 +1,39 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <mip_heuristics/early_heuristic.cuh>
+
+#include <atomic>
+#include <memory>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+struct cpu_fj_thread_t;
+
+template <typename i_t, typename f_t>
+class early_cpufj_t : public early_heuristic_t<i_t, f_t, early_cpufj_t<i_t, f_t>> {
+ public:
+  early_cpufj_t(const optimization_problem_t<i_t, f_t>& op_problem,
+                const typename mip_solver_settings_t<i_t, f_t>::tolerances_t& tolerances,
+                early_incumbent_callback_t<f_t> incumbent_callback);
+
+  ~early_cpufj_t();
+
+  static constexpr const char* name() { return "CPUFJ"; }
+
+  void start();
+  void stop();
+
+ private:
+  std::unique_ptr<cpu_fj_thread_t<i_t, f_t>> cpu_fj_thread_;
+  std::atomic<bool> preemption_flag_{false};
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
new file mode 100644
index 000000000..55726421d
--- /dev/null
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
@@ -0,0 +1,93 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#include "early_gpufj.cuh"
+
+#include <mip_heuristics/feasibility_jump/feasibility_jump.cuh>
+#include <mip_heuristics/mip_constants.hpp>
+#include <mip_heuristics/solver_context.cuh>
+#include <utilities/logger.hpp>
+
+#include <raft/core/error.hpp>
+
+#include <limits>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+early_gpufj_t<i_t, f_t>::early_gpufj_t(const optimization_problem_t<i_t, f_t>& op_problem,
+                                       const mip_solver_settings_t<i_t, f_t>& settings,
+                                       early_incumbent_callback_t<f_t> incumbent_callback)
+  : early_heuristic_t<i_t, f_t, early_gpufj_t<i_t, f_t>>(
+      op_problem, settings.get_tolerances(), std::move(incumbent_callback))
+{
+  context_ptr_ = std::make_unique<mip_solver_context_t<i_t, f_t>>(
+    &this->handle_, this->problem_ptr_.get(), settings, nullptr);
+}
+
+template <typename i_t, typename f_t>
+early_gpufj_t<i_t, f_t>::~early_gpufj_t()
+{
+  stop();
+}
+
+template <typename i_t, typename f_t>
+void early_gpufj_t<i_t, f_t>::start()
+{
+  if (worker_thread_) { return; }
+
+  this->start_time_ = std::chrono::steady_clock::now();
+
+  fj_settings_t fj_settings;
+  fj_settings.mode                   = fj_mode_t::EXIT_NON_IMPROVING;
+  fj_settings.n_of_minimums_for_exit = std::numeric_limits<int>::max();
+  fj_settings.time_limit             = std::numeric_limits<f_t>::infinity();
+  fj_settings.iteration_limit        = std::numeric_limits<int>::max();
+  fj_settings.update_weights         = true;
+  fj_settings.feasibility_run        = false;
+
+  fj_ptr_ = std::make_unique<fj_t<i_t, f_t>>(*context_ptr_, fj_settings);
+
+  fj_ptr_->improvement_callback = [this](f_t user_obj, const std::vector<f_t>& h_assignment) {
+    f_t solver_obj = this->problem_ptr_->get_solver_obj_from_user_obj(user_obj);
+    this->try_update_best(solver_obj, h_assignment);
+  };
+
+  worker_thread_ = std::make_unique<std::thread>(&early_gpufj_t::run_worker, this);
+}
+
+template <typename i_t, typename f_t>
+void early_gpufj_t<i_t, f_t>::run_worker()
+{
+  RAFT_CUDA_TRY(cudaSetDevice(this->device_id_));
+  fj_ptr_->solve(*this->solution_ptr_);
+}
+
+template <typename i_t, typename f_t>
+void early_gpufj_t<i_t, f_t>::stop()
+{
+  if (!worker_thread_) { return; }
+
+  context_ptr_->preempt_heuristic_solver_.store(true);
+
+  if (worker_thread_->joinable()) { worker_thread_->join(); }
+
+  CUOPT_LOG_DEBUG("[Early GPU FJ] Stopped, solution_found=%d", this->solution_found_);
+
+  fj_ptr_.reset();
+  worker_thread_.reset();
+}
+
+#if MIP_INSTANTIATE_FLOAT
+template class early_gpufj_t<int, float>;
+#endif
+
+#if MIP_INSTANTIATE_DOUBLE
+template class early_gpufj_t<int, double>;
+#endif
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh
new file mode 100644
index 000000000..4a7769143
--- /dev/null
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cuh
@@ -0,0 +1,45 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <mip_heuristics/early_heuristic.cuh>
+
+#include <memory>
+#include <thread>
+
+namespace cuopt::linear_programming::detail {
+
+template <typename i_t, typename f_t>
+class fj_t;
+
+template <typename i_t, typename f_t>
+struct mip_solver_context_t;
+
+template <typename i_t, typename f_t>
+class early_gpufj_t : public early_heuristic_t<i_t, f_t, early_gpufj_t<i_t, f_t>> {
+ public:
+  early_gpufj_t(const optimization_problem_t<i_t, f_t>& op_problem,
+                const mip_solver_settings_t<i_t, f_t>& settings,
+                early_incumbent_callback_t<f_t> incumbent_callback);
+
+  ~early_gpufj_t();
+
+  static constexpr const char* name() { return "GPUFJ"; }
+
+  void start();
+  void stop();
+
+ private:
+  void run_worker();
+
+  std::unique_ptr<mip_solver_context_t<i_t, f_t>> context_ptr_;
+  std::unique_ptr<fj_t<i_t, f_t>> fj_ptr_;
+  std::unique_ptr<std::thread> worker_thread_;
+};
+
+}  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu
index e9cf0760d..748dd41df 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cu
@@ -879,7 +879,7 @@ i_t fj_t<i_t, f_t>::host_loop(solution_t<i_t, f_t>& solution, i_t climber_idx)
     // every now and then, ensure external solutions are added to the population
     // this is done here because FJ is called within FP and also after recombiners
     // so FJ is one of the most inner and most frequent functions to be called
-    if (steps % 10000 == 0) {
+    if (steps % 10000 == 0 && context.diversity_manager_ptr != nullptr) {
       context.diversity_manager_ptr->get_population_pointer()
         ->add_external_solutions_to_population();
     }
@@ -933,6 +933,22 @@ i_t fj_t<i_t, f_t>::host_loop(solution_t<i_t, f_t>& solution, i_t climber_idx)
       bool is_feasible = solution.compute_feasibility();
       solution.handle_ptr->sync_stream();
 
+      // Invoke improvement callback if we have a better feasible solution
+      if (is_feasible && improvement_callback) {
+        f_t user_obj = solution.get_user_objective();
+        if (solution.h_obj < last_reported_objective_) {
+          last_reported_objective_ = solution.h_obj;
+          // Copy assignment to host for callback
+          std::vector<f_t> h_assignment(solution.assignment.size());
+          raft::copy(h_assignment.data(),
+                     solution.assignment.data(),
+                     solution.assignment.size(),
+                     climber_stream);
+          climber_stream.synchronize();
+          improvement_callback(user_obj, h_assignment);
+        }
+      }
+
       if (limit_reached) { break; }
 
       if (is_feasible) {
@@ -1059,8 +1075,9 @@ i_t fj_t<i_t, f_t>::solve(solution_t<i_t, f_t>& solution)
 {
   raft::common::nvtx::range scope("fj_solve");
   timer_t timer(settings.time_limit);
-  handle_ptr = const_cast<raft::handle_t*>(solution.handle_ptr);
-  pb_ptr     = solution.problem_ptr;
+  handle_ptr               = const_cast<raft::handle_t*>(solution.handle_ptr);
+  pb_ptr                   = solution.problem_ptr;
+  last_reported_objective_ = std::numeric_limits<f_t>::infinity();
   if (settings.mode != fj_mode_t::ROUNDING) {
     cuopt_func_call(solution.test_variable_bounds(true));
     cuopt_assert(solution.test_number_all_integer(), "All integers must be rounded");
diff --git a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh
index e9040a759..50b451a86 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/feasibility_jump.cuh
@@ -19,11 +19,17 @@
 
 #include <utilities/event_handler.cuh>
 
+#include <functional>
+
 #define FJ_DEBUG_LOAD_BALANCING 0
 #define FJ_SINGLE_STEP          0
 
 namespace cuopt::linear_programming::detail {
 
+template <typename f_t>
+using fj_improvement_callback_t =
+  std::function<void(f_t objective, const std::vector<f_t>& assignment)>;
+
 static constexpr int TPB_resetmoves                 = raft::WarpSize * 4;
 static constexpr int TPB_heavyvars                  = raft::WarpSize * 16;
 static constexpr int TPB_heavycstrs                 = raft::WarpSize * 4;
@@ -628,6 +634,9 @@ class fj_t {
   std::vector<std::unique_ptr<climber_data_t>> climbers;
   rmm::device_uvector<typename climber_data_t::view_t> climber_views;
   fj_settings_t settings;
+
+  fj_improvement_callback_t<f_t> improvement_callback;
+  f_t last_reported_objective_{std::numeric_limits<f_t>::infinity()};
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
index 4d567c9ec..af41545a9 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cu
@@ -790,7 +790,7 @@ static void apply_move(fj_cpu_climber_t<i_t, f_t>& fj_cpu,
       if (fj_cpu.improvement_callback) {
         double current_work_units = fj_cpu.work_units_elapsed.load(std::memory_order_acquire);
         fj_cpu.improvement_callback(
-          fj_cpu.h_best_objective, fj_cpu.h_assignment, current_work_units);
+          fj_cpu.h_incumbent_objective, fj_cpu.h_assignment, current_work_units);
       }
       fj_cpu.feasible_found = true;
     }
@@ -1402,10 +1402,8 @@ std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> fj_t<i_t, f_t>::create_cpu_climber(
 }
 
 template <typename i_t, typename f_t>
-bool fj_t<i_t, f_t>::cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_limit)
+static bool cpufj_solve_loop(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_limit)
 {
-  raft::common::nvtx::range scope("fj_cpu");
-
   i_t local_mins       = 0;
   auto loop_start      = std::chrono::high_resolution_clock::now();
   auto time_limit      = std::chrono::milliseconds((int)(in_time_limit * 1000));
@@ -1507,8 +1505,7 @@ bool fj_t<i_t, f_t>::cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_l
     if (fj_cpu.iterations % fj_cpu.log_interval == 0) {
       CUOPT_LOG_TRACE(
         "%sCPUFJ iteration: %d/%d, local mins: %d, best_objective: %g, viol: %zu, obj weight %g, "
-        "maxw "
-        "%g",
+        "maxw %g",
         fj_cpu.log_prefix.c_str(),
         fj_cpu.iterations,
         fj_cpu.settings.iteration_limit != std::numeric_limits<i_t>::max()
@@ -1537,15 +1534,10 @@ bool fj_t<i_t, f_t>::cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_l
     if (fj_cpu.iterations % 100 == 0 && fj_cpu.iterations > 0) {
       // Collect memory statistics
       auto [loads, stores] = fj_cpu.memory_aggregator.collect();
-
-      double biased_work = (loads + stores) * fj_cpu.work_unit_bias / 1e10;
+      double biased_work   = (loads + stores) * fj_cpu.work_unit_bias / 1e10;
       fj_cpu.work_units_elapsed += biased_work;
 
       if (fj_cpu.producer_sync != nullptr) { fj_cpu.producer_sync->notify_progress(); }
-
-      CUOPT_LOG_TRACE("CPUFJ work units: %f incumbent %g",
-                      fj_cpu.work_units_elapsed.load(std::memory_order_relaxed),
-                      fj_cpu.pb_ptr->get_user_obj_from_solver_obj(fj_cpu.h_best_objective));
     }
 
     cuopt_func_call(sanity_checks(fj_cpu));
@@ -1569,6 +1561,13 @@ bool fj_t<i_t, f_t>::cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_l
   return fj_cpu.feasible_found;
 }
 
+template <typename i_t, typename f_t>
+bool fj_t<i_t, f_t>::cpu_solve(fj_cpu_climber_t<i_t, f_t>& fj_cpu, f_t in_time_limit)
+{
+  raft::common::nvtx::range scope("fj_cpu");
+  return cpufj_solve_loop(fj_cpu, in_time_limit);
+}
+
 template <typename i_t, typename f_t>
 cpu_fj_thread_t<i_t, f_t>::~cpu_fj_thread_t()
 {
@@ -1578,8 +1577,7 @@ cpu_fj_thread_t<i_t, f_t>::~cpu_fj_thread_t()
 template <typename i_t, typename f_t>
 void cpu_fj_thread_t<i_t, f_t>::run_worker()
 {
-  bool solution_found   = fj_ptr->cpu_solve(*fj_cpu, time_limit);
-  cpu_fj_solution_found = solution_found;
+  cpu_fj_solution_found = cpufj_solve_loop(*fj_cpu, time_limit);
 }
 
 template <typename i_t, typename f_t>
@@ -1601,14 +1599,43 @@ void cpu_fj_thread_t<i_t, f_t>::stop_cpu_solver()
   fj_cpu->halted = true;
 }
 
+template <typename i_t, typename f_t>
+std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> init_fj_cpu_standalone(
+  problem_t<i_t, f_t>& problem,
+  solution_t<i_t, f_t>& solution,
+  std::atomic<bool>& preemption_flag,
+  fj_settings_t settings)
+{
+  raft::common::nvtx::range scope("init_fj_cpu_standalone");
+
+  auto fj_cpu = std::make_unique<fj_cpu_climber_t<i_t, f_t>>(preemption_flag);
+
+  std::vector<f_t> default_weights(problem.n_constraints, 1.0);
+  init_fj_cpu(*fj_cpu, solution, default_weights, default_weights, 0.0);
+  fj_cpu->settings      = settings;
+  fj_cpu->settings.seed = cuopt::seed_generator::get_seed();
+
+  return fj_cpu;
+}
+
 #if MIP_INSTANTIATE_FLOAT
 template class fj_t<int, float>;
 template class cpu_fj_thread_t<int, float>;
+template std::unique_ptr<fj_cpu_climber_t<int, float>> init_fj_cpu_standalone(
+  problem_t<int, float>& problem,
+  solution_t<int, float>& solution,
+  std::atomic<bool>& preemption_flag,
+  fj_settings_t settings);
 #endif
 
 #if MIP_INSTANTIATE_DOUBLE
 template class fj_t<int, double>;
 template class cpu_fj_thread_t<int, double>;
+template std::unique_ptr<fj_cpu_climber_t<int, double>> init_fj_cpu_standalone(
+  problem_t<int, double>& problem,
+  solution_t<int, double>& solution,
+  std::atomic<bool>& preemption_flag,
+  fj_settings_t settings);
 #endif
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh
index 7dcc8d39b..3263609a2 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh
+++ b/cpp/src/mip_heuristics/feasibility_jump/fj_cpu.cuh
@@ -210,4 +210,13 @@ struct cpu_fj_thread_t : public cpu_worker_thread_base_t<cpu_fj_thread_t<i_t, f_
   fj_t<i_t, f_t>* fj_ptr{nullptr};
 };
 
+// Standalone CPUFJ init for running without full fj_t infrastructure (avoids GPU allocations).
+// Used for early CPUFJ during presolve.
+template <typename i_t, typename f_t>
+std::unique_ptr<fj_cpu_climber_t<i_t, f_t>> init_fj_cpu_standalone(
+  problem_t<i_t, f_t>& problem,
+  solution_t<i_t, f_t>& solution,
+  std::atomic<bool>& preemption_flag,
+  fj_settings_t settings = fj_settings_t{});
+
 }  // namespace cuopt::linear_programming::detail
diff --git a/cpp/src/mip_heuristics/problem/presolve_data.cu b/cpp/src/mip_heuristics/problem/presolve_data.cu
index b11f7b108..1bc05a779 100644
--- a/cpp/src/mip_heuristics/problem/presolve_data.cu
+++ b/cpp/src/mip_heuristics/problem/presolve_data.cu
@@ -107,7 +107,8 @@ template <typename i_t, typename f_t>
 void presolve_data_t<i_t, f_t>::post_process_assignment(
   problem_t<i_t, f_t>& problem,
   rmm::device_uvector<f_t>& current_assignment,
-  bool resize_to_original_problem)
+  bool resize_to_original_problem,
+  rmm::cuda_stream_view stream)
 {
   raft::common::nvtx::range fun_scope("post_process_assignment");
   cuopt_assert(current_assignment.size() == variable_mapping.size(), "size mismatch");
@@ -115,15 +116,15 @@ void presolve_data_t<i_t, f_t>::post_process_assignment(
   auto fixed_assgn = make_span(fixed_var_assignment);
   auto var_map     = make_span(variable_mapping);
   if (current_assignment.size() > 0) {
-    thrust::for_each(problem.handle_ptr->get_thrust_policy(),
+    thrust::for_each(rmm::exec_policy(stream),
                      thrust::make_counting_iterator<i_t>(0),
                      thrust::make_counting_iterator<i_t>(current_assignment.size()),
                      [fixed_assgn, var_map, assgn] __device__(auto idx) {
                        fixed_assgn[var_map[idx]] = assgn[idx];
                      });
   }
-  expand_device_copy(current_assignment, fixed_var_assignment, problem.handle_ptr->get_stream());
-  auto h_assignment = cuopt::host_copy(current_assignment, problem.handle_ptr->get_stream());
+  expand_device_copy(current_assignment, fixed_var_assignment, stream);
+  auto h_assignment = cuopt::host_copy(current_assignment, stream);
   cuopt_assert(additional_var_id_per_var.size() == h_assignment.size(), "Size mismatch");
   cuopt_assert(additional_var_used.size() == h_assignment.size(), "Size mismatch");
   for (i_t i = 0; i < (i_t)h_assignment.size(); ++i) {
@@ -148,14 +149,10 @@ void presolve_data_t<i_t, f_t>::post_process_assignment(
                     h_assignment[sub.substituted_var]);
   }
 
-  raft::copy(current_assignment.data(),
-             h_assignment.data(),
-             h_assignment.size(),
-             problem.handle_ptr->get_stream());
   // this separate resizing is needed because of the callback
+  raft::copy(current_assignment.data(), h_assignment.data(), h_assignment.size(), stream);
   if (resize_to_original_problem) {
-    current_assignment.resize(problem.original_problem_ptr->get_n_variables(),
-                              problem.handle_ptr->get_stream());
+    current_assignment.resize(problem.original_problem_ptr->get_n_variables(), stream);
   }
 }
 
diff --git a/cpp/src/mip_heuristics/problem/presolve_data.cuh b/cpp/src/mip_heuristics/problem/presolve_data.cuh
index 51b6bac95..cac3e7165 100644
--- a/cpp/src/mip_heuristics/problem/presolve_data.cuh
+++ b/cpp/src/mip_heuristics/problem/presolve_data.cuh
@@ -89,7 +89,15 @@ class presolve_data_t {
   bool pre_process_assignment(problem_t<i_t, f_t>& problem, rmm::device_uvector<f_t>& assignment);
   void post_process_assignment(problem_t<i_t, f_t>& problem,
                                rmm::device_uvector<f_t>& current_assignment,
-                               bool resize_to_original_problem = true);
+                               bool resize_to_original_problem,
+                               rmm::cuda_stream_view stream);
+  void post_process_assignment(problem_t<i_t, f_t>& problem,
+                               rmm::device_uvector<f_t>& current_assignment,
+                               bool resize_to_original_problem = true)
+  {
+    post_process_assignment(
+      problem, current_assignment, resize_to_original_problem, problem.handle_ptr->get_stream());
+  }
   void post_process_solution(problem_t<i_t, f_t>& problem, solution_t<i_t, f_t>& solution);
 
   void set_papilo_presolve_data(const third_party_presolve_t<i_t, f_t>* presolver_ptr,
diff --git a/cpp/src/mip_heuristics/problem/problem.cu b/cpp/src/mip_heuristics/problem/problem.cu
index d77e2e5f6..dde334476 100644
--- a/cpp/src/mip_heuristics/problem/problem.cu
+++ b/cpp/src/mip_heuristics/problem/problem.cu
@@ -2045,9 +2045,11 @@ bool problem_t<i_t, f_t>::pre_process_assignment(rmm::device_uvector<f_t>& assig
 
 template <typename i_t, typename f_t>
 void problem_t<i_t, f_t>::post_process_assignment(rmm::device_uvector<f_t>& current_assignment,
-                                                  bool resize_to_original_problem)
+                                                  bool resize_to_original_problem,
+                                                  rmm::cuda_stream_view stream)
 {
-  presolve_data.post_process_assignment(*this, current_assignment, resize_to_original_problem);
+  presolve_data.post_process_assignment(
+    *this, current_assignment, resize_to_original_problem, stream);
 }
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/mip_heuristics/problem/problem.cuh b/cpp/src/mip_heuristics/problem/problem.cuh
index 6cd180a80..9876773ec 100644
--- a/cpp/src/mip_heuristics/problem/problem.cuh
+++ b/cpp/src/mip_heuristics/problem/problem.cuh
@@ -94,7 +94,14 @@ class problem_t {
   void preprocess_problem();
   bool pre_process_assignment(rmm::device_uvector<f_t>& assignment);
   void post_process_assignment(rmm::device_uvector<f_t>& current_assignment,
-                               bool resize_to_original_problem = true);
+                               bool resize_to_original_problem,
+                               rmm::cuda_stream_view stream);
+  void post_process_assignment(rmm::device_uvector<f_t>& current_assignment,
+                               bool resize_to_original_problem = true)
+  {
+    post_process_assignment(
+      current_assignment, resize_to_original_problem, handle_ptr->get_stream());
+  }
   void post_process_solution(solution_t<i_t, f_t>& solution);
   void set_papilo_presolve_data(const third_party_presolve_t<i_t, f_t>* presolver_ptr,
                                 std::vector<i_t> reduced_to_original,
diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu
index 44f0e7468..f7c3063c9 100644
--- a/cpp/src/mip_heuristics/solve.cu
+++ b/cpp/src/mip_heuristics/solve.cu
@@ -7,6 +7,8 @@
 
 #include <cuopt/error.hpp>
 
+#include <mip_heuristics/feasibility_jump/early_cpufj.cuh>
+#include <mip_heuristics/feasibility_jump/early_gpufj.cuh>
 #include <mip_heuristics/mip_constants.hpp>
 #include <mip_heuristics/presolve/third_party_presolve.hpp>
 #include <mip_heuristics/presolve/trivial_presolve.cuh>
@@ -51,10 +53,30 @@ static void init_handler(const raft::handle_t* handle_ptr)
     handle_ptr->get_cusparse_handle(), CUSPARSE_POINTER_MODE_DEVICE, handle_ptr->get_stream()));
 }
 
+template <typename f_t>
+static void invoke_solution_callbacks(
+  const std::vector<internals::base_solution_callback_t*>& mip_callbacks,
+  f_t objective,
+  std::vector<f_t>& assignment,
+  f_t bound)
+{
+  std::vector<f_t> obj_vec   = {objective};
+  std::vector<f_t> bound_vec = {bound};
+  for (auto callback : mip_callbacks) {
+    if (callback != nullptr &&
+        callback->get_type() == internals::base_solution_callback_type::GET_SOLUTION) {
+      auto get_sol_callback = static_cast<internals::get_solution_callback_t*>(callback);
+      get_sol_callback->get_solution(
+        assignment.data(), obj_vec.data(), bound_vec.data(), get_sol_callback->get_user_data());
+    }
+  }
+}
+
 template <typename i_t, typename f_t>
 mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
                                  mip_solver_settings_t<i_t, f_t> const& settings,
-                                 timer_t& timer)
+                                 timer_t& timer,
+                                 f_t initial_cutoff = std::numeric_limits<f_t>::infinity())
 {
   raft::common::nvtx::range fun_scope("run_mip");
   auto constexpr const running_mip = true;
@@ -63,15 +85,6 @@ mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
   auto hyper_params                                     = settings.hyper_params;
   hyper_params.update_primal_weight_on_initial_solution = false;
   hyper_params.update_step_size_on_initial_solution     = true;
-  if (settings.get_mip_callbacks().size() > 0) {
-    auto callback_num_variables = problem.original_problem_ptr->get_n_variables();
-    if (problem.has_papilo_presolve_data()) {
-      callback_num_variables = problem.get_papilo_original_num_variables();
-    }
-    for (auto callback : settings.get_mip_callbacks()) {
-      callback->template setup<f_t>(callback_num_variables);
-    }
-  }
   // if the input problem is empty: early exit
   if (problem.empty) {
     detail::solution_t<i_t, f_t> solution(problem);
@@ -153,6 +166,7 @@ mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
   detail::trivial_presolve(scaled_problem);
 
   detail::mip_solver_t<i_t, f_t> solver(scaled_problem, settings, scaling, timer);
+  solver.context.initial_cutoff = initial_cutoff;
   if (timer.check_time_limit()) {
     CUOPT_LOG_INFO("Time limit reached before main solve");
     detail::solution_t<i_t, f_t> sol(problem);
@@ -160,9 +174,35 @@ mip_solution_t<i_t, f_t> run_mip(detail::problem_t<i_t, f_t>& problem,
     stats.total_solve_time = timer.elapsed_time();
     return sol.get_solution(false, stats, false);
   }
+
+  // Run early CPUFJ on papilo-presolved problem during cuOpt presolve (probing cache).
+  // Stopped by run_solver after presolve completes; its best objective feeds into initial_cutoff.
+  std::unique_ptr<detail::early_cpufj_t<i_t, f_t>> early_cpufj;
+  bool run_early_cpufj = problem.has_papilo_presolve_data() &&
+                         settings.determinism_mode != CUOPT_MODE_DETERMINISTIC &&
+                         problem.original_problem_ptr->get_n_integers() > 0;
+  if (run_early_cpufj) {
+    auto* presolver_ptr = problem.presolve_data.papilo_presolve_ptr;
+    auto mip_callbacks  = settings.get_mip_callbacks();
+    f_t no_bound = problem.presolve_data.objective_scaling_factor >= 0 ? (f_t)-1e20 : (f_t)1e20;
+    auto incumbent_callback = [presolver_ptr, mip_callbacks, no_bound](
+                                f_t solver_obj, f_t user_obj, const std::vector<f_t>& assignment) {
+      std::vector<f_t> user_assignment;
+      presolver_ptr->uncrush_primal_solution(assignment, user_assignment);
+      invoke_solution_callbacks(mip_callbacks, user_obj, user_assignment, no_bound);
+    };
+    early_cpufj = std::make_unique<detail::early_cpufj_t<i_t, f_t>>(
+      *problem.original_problem_ptr, settings.get_tolerances(), incumbent_callback);
+    early_cpufj->set_best_objective(initial_cutoff);
+    early_cpufj->start();
+    solver.context.early_cpufj_ptr = early_cpufj.get();
+    CUOPT_LOG_DEBUG("Started early CPUFJ on papilo-presolved problem during cuOpt presolve");
+  }
+
   auto scaled_sol                 = solver.run_solver();
   bool is_feasible_before_scaling = scaled_sol.get_feasible();
   scaled_sol.problem_ptr          = &problem;
+
   if (settings.mip_scaling) { scaling.unscale_solutions(scaled_sol); }
   // at this point we need to compute the feasibility on the original problem not the presolved one
   bool is_feasible_after_unscaling = scaled_sol.compute_feasibility();
@@ -233,6 +273,10 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
                                       op_problem.get_handle_ptr()->get_stream());
     }
 
+    for (auto callback : settings.get_mip_callbacks()) {
+      callback->template setup<f_t>(op_problem.get_n_variables());
+    }
+
     auto timer = timer_t(time_limit);
 
     double presolve_time = 0.0;
@@ -258,6 +302,45 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
 
     if (!run_presolve) { CUOPT_LOG_INFO("Presolve is disabled, skipping"); }
 
+    // Start early FJ (CPU and GPU) during presolve to find incumbents ASAP
+    // Only run if presolve is enabled (gives FJ time to find solutions)
+    // and we're not in deterministic mode
+    std::unique_ptr<detail::early_cpufj_t<i_t, f_t>> early_cpufj;
+    std::unique_ptr<detail::early_gpufj_t<i_t, f_t>> early_gpufj;
+
+    // Track best incumbent found during presolve (shared across CPU and GPU FJ)
+    std::atomic<f_t> early_best_objective{std::numeric_limits<f_t>::infinity()};
+    std::mutex early_callback_mutex;
+
+    bool run_early_fj = run_presolve && settings.determinism_mode != CUOPT_MODE_DETERMINISTIC &&
+                        op_problem.get_n_integers() > 0 && op_problem.get_n_constraints() > 0;
+    f_t no_bound = problem.presolve_data.objective_scaling_factor >= 0 ? (f_t)-1e20 : (f_t)1e20;
+    if (run_early_fj) {
+      auto early_fj_callback = [&early_best_objective,
+                                &early_callback_mutex,
+                                mip_callbacks = settings.get_mip_callbacks(),
+                                no_bound](
+                                 f_t solver_obj, f_t user_obj, const std::vector<f_t>& assignment) {
+        std::lock_guard<std::mutex> lock(early_callback_mutex);
+        if (solver_obj >= early_best_objective.load()) { return; }
+        early_best_objective.store(solver_obj);
+        auto user_assignment = assignment;
+        invoke_solution_callbacks(mip_callbacks, user_obj, user_assignment, no_bound);
+      };
+
+      // Start early CPUFJ on original problem (will restart on presolved problem after Papilo)
+      early_cpufj = std::make_unique<detail::early_cpufj_t<i_t, f_t>>(
+        op_problem, settings.get_tolerances(), early_fj_callback);
+      early_cpufj->start();
+      CUOPT_LOG_INFO("Started early CPUFJ on original problem");
+
+      // Start early GPU FJ (uses GPU while CPU is busy with Papilo)
+      early_gpufj =
+        std::make_unique<detail::early_gpufj_t<i_t, f_t>>(op_problem, settings, early_fj_callback);
+      early_gpufj->start();
+      CUOPT_LOG_INFO("Started early GPUFJ during presolve");
+    }
+
     auto constexpr const dual_postsolve = false;
     if (run_presolve) {
       detail::sort_csr(op_problem);
@@ -295,12 +378,32 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
       }
       CUOPT_LOG_INFO("Papilo presolve time: %.2f", presolve_time);
     }
+
+    // Stop early GPU FJ now that Papilo presolve is complete
+    if (early_gpufj) {
+      early_gpufj->stop();
+      if (early_gpufj->solution_found()) {
+        CUOPT_LOG_INFO("Early GPU FJ found incumbent with objective %.6e during presolve",
+                       early_gpufj->get_best_objective());
+      }
+      early_gpufj.reset();  // Free GPU memory
+    }
+
+    if (early_cpufj && run_presolve && presolve_result.has_value()) {
+      early_cpufj->stop();
+      if (early_cpufj->solution_found()) {
+        CUOPT_LOG_INFO("Early CPUFJ (original) found incumbent with objective %.6e",
+                       early_cpufj->get_best_objective());
+      }
+      early_cpufj.reset();
+    }
+
     if (settings.user_problem_file != "") {
       CUOPT_LOG_INFO("Writing user problem to file: %s", settings.user_problem_file.c_str());
       op_problem.write_to_mps(settings.user_problem_file);
     }
 
-    auto sol = run_mip(problem, settings, timer);
+    auto sol = run_mip(problem, settings, timer, early_best_objective.load());
 
     if (run_presolve) {
       auto status_to_skip = sol.get_termination_status() == mip_termination_status_t::TimeLimit ||
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index 235d4500d..789e71d2c 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -17,6 +17,7 @@
 #include <branch_and_bound/branch_and_bound.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
 #include <dual_simplex/solve.hpp>
+#include <mip_heuristics/feasibility_jump/early_cpufj.cuh>
 
 #include <raft/sparse/detail/cusparse_wrappers.h>
 #include <raft/core/cusparse_macros.hpp>
@@ -48,7 +49,7 @@ mip_solver_t<i_t, f_t>::mip_solver_t(const problem_t<i_t, f_t>& op_problem,
     context(op_problem.handle_ptr,
             const_cast<problem_t<i_t, f_t>*>(&op_problem),
             solver_settings,
-            scaling),
+            &scaling),
     timer_(timer)
 {
   init_handler(op_problem.handle_ptr);
@@ -113,6 +114,17 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
                               ? std::numeric_limits<f_t>::infinity()
                               : timer_.remaining_time();
   bool presolve_success   = run_presolve ? dm.run_presolve(time_limit) : true;
+
+  // Stop early CPUFJ after cuopt presolve (probing cache) but before main solve
+  if (context.early_cpufj_ptr) {
+    context.early_cpufj_ptr->stop();
+    if (context.early_cpufj_ptr->solution_found()) {
+      f_t obj                = context.early_cpufj_ptr->get_best_objective();
+      context.initial_cutoff = std::min(context.initial_cutoff, obj);
+      CUOPT_LOG_INFO("Early CPUFJ found incumbent with objective %g during presolve", obj);
+    }
+  }
+
   if (!presolve_success) {
     CUOPT_LOG_INFO("Problem proven infeasible in presolve");
     solution_t<i_t, f_t> sol(*context.problem_ptr);
@@ -259,7 +271,14 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     branch_and_bound = std::make_unique<dual_simplex::branch_and_bound_t<i_t, f_t>>(
       branch_and_bound_problem, branch_and_bound_settings, timer_.get_tic_start());
     context.branch_and_bound_ptr = branch_and_bound.get();
-    auto* stats_ptr              = &context.stats;
+
+    // Set cutoff from early FJ if available (initial_cutoff is in solver-space)
+    if (context.initial_cutoff < std::numeric_limits<f_t>::infinity()) {
+      branch_and_bound->set_initial_cutoff(context.initial_cutoff);
+      CUOPT_LOG_INFO("B&B using initial cutoff %.6e from early heuristics", context.initial_cutoff);
+    }
+
+    auto* stats_ptr = &context.stats;
     branch_and_bound->set_user_bound_callback(
       [stats_ptr](f_t user_bound) { stats_ptr->set_solution_bound(user_bound); });
 
diff --git a/cpp/src/mip_heuristics/solver_context.cuh b/cpp/src/mip_heuristics/solver_context.cuh
index baac1dd9d..d1be34ea6 100644
--- a/cpp/src/mip_heuristics/solver_context.cuh
+++ b/cpp/src/mip_heuristics/solver_context.cuh
@@ -28,6 +28,9 @@ namespace cuopt::linear_programming::detail {
 template <typename i_t, typename f_t>
 class diversity_manager_t;
 
+template <typename i_t, typename f_t>
+class early_cpufj_t;
+
 // Aggregate structure containing the global context of the solving process for convenience:
 // The current problem, user settings, raft handle and statistics objects
 template <typename i_t, typename f_t>
@@ -35,7 +38,7 @@ struct mip_solver_context_t {
   explicit mip_solver_context_t(raft::handle_t const* handle_ptr_,
                                 problem_t<i_t, f_t>* problem_ptr_,
                                 mip_solver_settings_t<i_t, f_t> settings_,
-                                pdlp_initial_scaling_strategy_t<i_t, f_t>& scaling)
+                                pdlp_initial_scaling_strategy_t<i_t, f_t>* scaling)
     : handle_ptr(handle_ptr_), problem_ptr(problem_ptr_), settings(settings_), scaling(scaling)
   {
     cuopt_assert(problem_ptr != nullptr, "problem_ptr is nullptr");
@@ -53,7 +56,7 @@ struct mip_solver_context_t {
   diversity_manager_t<i_t, f_t>* diversity_manager_ptr{nullptr};
   std::atomic<bool> preempt_heuristic_solver_ = false;
   const mip_solver_settings_t<i_t, f_t> settings;
-  pdlp_initial_scaling_strategy_t<i_t, f_t>& scaling;
+  pdlp_initial_scaling_strategy_t<i_t, f_t>* scaling;  // nullptr when not available (early FJ)
   solver_stats_t<i_t, f_t> stats;
   // Work limit context for tracking work units in deterministic mode (shared across all timers in
   // GPU heuristic loop)
@@ -61,6 +64,9 @@ struct mip_solver_context_t {
 
   // synchronization every 5 seconds for deterministic mode
   work_unit_scheduler_t work_unit_scheduler_{5.0};
+
+  early_cpufj_t<i_t, f_t>* early_cpufj_ptr{nullptr};
+  f_t initial_cutoff{std::numeric_limits<f_t>::infinity()};
 };
 
 }  // namespace cuopt::linear_programming::detail
diff --git a/python/cuopt/cuopt/linear_programming/solver/solver.pxd b/python/cuopt/cuopt/linear_programming/solver/solver.pxd
index 83b1e0054..adc566aa1 100644
--- a/python/cuopt/cuopt/linear_programming/solver/solver.pxd
+++ b/python/cuopt/cuopt/linear_programming/solver/solver.pxd
@@ -181,9 +181,9 @@ cdef extern from "cuopt/linear_programming/utilities/cython_solve.hpp" namespace
     cdef unique_ptr[solver_ret_t] call_solve(
         data_model_view_t[int, double]* data_model,
         solver_settings_t[int, double]* solver_settings,
-    ) except +
+    ) except + nogil
 
     cdef pair[vector[unique_ptr[solver_ret_t]], double] call_batch_solve( # noqa
         vector[data_model_view_t[int, double] *] data_models,
         solver_settings_t[int, double]* solver_settings,
-    ) except +
+    ) except + nogil
diff --git a/python/cuopt/cuopt/linear_programming/solver/solver_wrapper.pyx b/python/cuopt/cuopt/linear_programming/solver/solver_wrapper.pyx
index 8c1c8fdc3..e7d700ea9 100644
--- a/python/cuopt/cuopt/linear_programming/solver/solver_wrapper.pyx
+++ b/python/cuopt/cuopt/linear_programming/solver/solver_wrapper.pyx
@@ -507,10 +507,13 @@ def Solve(py_data_model_obj, settings, mip=False):
     )
     data_model_obj.set_data_model_view()
 
-    return create_solution(move(call_solve(
-        data_model_obj.c_data_model_view.get(),
-        unique_solver_settings.get(),
-    )), data_model_obj)
+    cdef unique_ptr[solver_ret_t] sol_ret_ptr
+    with nogil:
+        sol_ret_ptr = move(call_solve(
+            data_model_obj.c_data_model_view.get(),
+            unique_solver_settings.get(),
+        ))
+    return create_solution(move(sol_ret_ptr), data_model_obj)
 
 
 cdef set_and_insert_vector(
@@ -535,9 +538,9 @@ def BatchSolve(py_data_model_list, settings):
 
     cdef pair[
         vector[unique_ptr[solver_ret_t]],
-        double] batch_solve_result = (
-        move(call_batch_solve(data_model_views, unique_solver_settings.get())) # noqa
-    )
+        double] batch_solve_result
+    with nogil:
+        batch_solve_result = move(call_batch_solve(data_model_views, unique_solver_settings.get()))  # noqa
 
     cdef vector[unique_ptr[solver_ret_t]] c_solutions = (
         move(batch_solve_result.first)