NEROX/CUDA Integration

Direct C++ API for embedding NEROX solvers into CUDA applications. Submit device-resident QUBO matrices without CPU-GPU memory transfers.

C++ SDK installation

bash
# Install via apt (Ubuntu 22.04; apt-key is deprecated, so use a keyring)
wget -qO- https://pkg.driftrail.com/gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/nerox-archive-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/nerox-archive-keyring.gpg] https://pkg.driftrail.com/apt stable main" | sudo tee /etc/apt/sources.list.d/nerox.list
sudo apt-get update && sudo apt-get install -y nerox-cuda-dev

# Or via CMake FetchContent
# See docs for CMakeLists.txt snippet

Basic usage

cpp
#include <nerox/solver.cuh>
#include <nerox/result.h>

#include <cuda_runtime.h>  // cudaMalloc / cudaFree
#include <cstdio>          // printf
#include <cstdlib>         // std::getenv

int main() {
    // Allocate Q matrix on device
    int n = 100;
    double* d_Q;
    cudaMalloc(&d_Q, n * n * sizeof(double));
    // ... fill d_Q with your QUBO matrix ...

    // Configure solver
    nerox::SolverConfig cfg;
    cfg.solver = nerox::SolverType::GPU_ANNEALING;
    cfg.n_runs = 512;
    cfg.n_sweeps = 10000;
    // std::getenv returns nullptr if the variable is unset, so export
    // NEROX_LICENSE_KEY before running
    cfg.license_key = std::getenv("NEROX_LICENSE_KEY");

    // Submit — uses device memory directly, no copy to host
    nerox::Job job = nerox::submit_device_qubo(d_Q, n, cfg);

    // Block until done
    nerox::Result result = job.wait();

    printf("Objective: %.4f\n", result.objective);
    for (int i = 0; i < n; i++) printf("%d ", result.solution[i]);

    cudaFree(d_Q);
    return 0;
}
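
The fill step above is problem-specific. As one minimal sketch, assuming the matrix is first assembled on the host (build_qubo_host is a hypothetical placeholder for your own setup code), an explicit host-to-device copy works; skip this pattern when the matrix is produced on the GPU, as in the zero-copy pipeline below.

cpp
// Hypothetical host-side assembly followed by an explicit H2D copy
// (needs #include <vector>); build_qubo_host is a placeholder.
std::vector<double> h_Q(n * n, 0.0);
build_qubo_host(h_Q.data(), n);  // your problem-specific setup
cudaMemcpy(d_Q, h_Q.data(), n * n * sizeof(double), cudaMemcpyHostToDevice);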

Zero-copy pipeline

When your application builds the QUBO matrix in a CUDA kernel (e.g., from streaming sensor data), pass the device pointer directly to submit_device_qubo. The solver runs on the same device in the same CUDA context — no PCIe transfer needed.

cpp
// Build Q in a kernel
my_build_qubo_kernel<<<grid, block>>>(d_Q, d_sensor_data, n);
cudaDeviceSynchronize();

// Solve without copying to host
nerox::Job job = nerox::submit_device_qubo(d_Q, n, cfg);
nerox::Result r = job.wait();

// r.solution is host-resident (see Basic usage), so copy it back to
// the device for the next CUDA stage; a kernel cannot dereference a
// host pointer
int* d_solution;
cudaMalloc(&d_solution, n * sizeof(int));
cudaMemcpy(d_solution, r.solution.data(), n * sizeof(int), cudaMemcpyHostToDevice);
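
For illustration, a toy version of my_build_qubo_kernel. This is user code, not part of the SDK, and the coupling below (pairwise products of a sensor vector) is purely illustrative; a real kernel encodes whatever objective your application needs.

cpp
// Toy example only: Q[i*n + j] = sensor[i] * sensor[j], launched
// with a 2D grid covering the n x n matrix.
__global__ void my_build_qubo_kernel(double* Q, const double* sensor, int n) {
    int i = blockIdx.y * blockDim.y + threadIdx.y;  // row
    int j = blockIdx.x * blockDim.x + threadIdx.x;  // column
    if (i < n && j < n) {
        Q[i * n + j] = sensor[i] * sensor[j];
    }
}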

Python interop via device pointer

PyTorch tensors expose their device pointer via .data_ptr(); CuPy arrays expose it via .data.ptr. Pass this pointer to the Python SDK's qubo_from_device_ptr to avoid CPU roundtrips in Python pipelines.

python
import torch
import nerox

Q_gpu = torch.randn(100, 100, device='cuda', dtype=torch.float64)
Q_gpu = (Q_gpu + Q_gpu.T) / 2

# Make sure pending CUDA work on Q_gpu has finished before handing
# the raw pointer to an external library
torch.cuda.synchronize()

job = nerox.optimize.qubo_from_device_ptr(
    ptr=Q_gpu.data_ptr(),
    n=100,
    dtype="float64",
    device=0,
    solver="gpu",
)
result = job.wait()
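
Keep Q_gpu referenced until job.wait() returns: the solver reads the buffer in place, so releasing the tensor (or letting the framework reuse its memory) while the job is running would leave the submitted device pointer dangling.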