C++ SDK installation
bash
# Install via apt (Ubuntu 22.04)
# NOTE(review): apt-key is deprecated on Ubuntu 22.04 -- prefer placing the
# key under /etc/apt/keyrings/ and referencing it with a [signed-by=...]
# clause in the sources entry.
wget -qO- https://pkg.driftrail.com/gpg.key | sudo apt-key add -
echo "deb https://pkg.driftrail.com/apt stable main" | sudo tee /etc/apt/sources.list.d/nerox.list
sudo apt-get update && sudo apt-get install -y nerox-cuda-dev

# Or via CMake FetchContent
# See docs for CMakeLists.txt snippet
Basic usage
cpp
#include <cstdio>
#include <cstdlib>

#include <nerox/result.h>
#include <nerox/solver.cuh>
// Minimal end-to-end example: allocate a QUBO matrix on the GPU, solve it,
// and print the best objective and solution vector.
int main() {
    // Problem size: Q is an n x n dense matrix.
    const int n = 100;

    // Allocate Q matrix on device.
    double* d_Q = nullptr;
    cudaMalloc(&d_Q, n * n * sizeof(double));
    // ... fill d_Q with your QUBO matrix ...

    // Configure solver.
    nerox::SolverConfig cfg;
    cfg.solver = nerox::SolverType::GPU_ANNEALING;
    cfg.n_runs = 512;
    cfg.n_sweeps = 10000;

    // std::getenv returns nullptr when the variable is unset -- fail fast
    // instead of handing the SDK a null license key.
    const char* license = std::getenv("NEROX_LICENSE_KEY");
    if (license == nullptr) {
        fprintf(stderr, "NEROX_LICENSE_KEY is not set\n");
        cudaFree(d_Q);
        return 1;
    }
    cfg.license_key = license;

    // Submit -- uses device memory directly, no copy to host.
    nerox::Job job = nerox::submit_device_qubo(d_Q, n, cfg);

    // Block until done.
    nerox::Result result = job.wait();
    printf("Objective: %.4f\n", result.objective);
    for (int i = 0; i < n; i++) printf("%d ", result.solution[i]);
    printf("\n");  // terminate the solution line

    cudaFree(d_Q);
    return 0;
}
Zero-copy pipeline
When your application builds the QUBO matrix in a CUDA kernel (e.g., from streaming sensor data), pass the device pointer directly to submit_device_qubo. The solver runs on the same device in the same CUDA context — no PCIe transfer needed.
cpp
// Build Q in a kernel my_build_qubo_kernel<<<grid, block>>>(d_Q, d_sensor_data, n); cudaDeviceSynchronize(); // Solve without copying to host nerox::Job job = nerox::submit_device_qubo(d_Q, n, cfg); nerox::Result r = job.wait(); // Pass solution directly to next CUDA stage copy_solution_to_device<<<1, n>>>(d_solution, r.solution.data(), n);
Python interop via device pointer
PyTorch tensors expose their device data pointer via .data_ptr(), and CuPy arrays expose it via .data.ptr. Pass this pointer to the Python SDK's qubo_from_device_ptr to avoid CPU roundtrips in Python pipelines.
python
import torch
import nerox
# Build a symmetric QUBO matrix on the GPU (float64 to match dtype below).
Q_gpu = torch.randn(100, 100, device='cuda', dtype=torch.float64)
Q_gpu = (Q_gpu + Q_gpu.T) / 2

# Derive n from the tensor itself rather than repeating the literal 100,
# so resizing the problem only requires changing one line.
n = Q_gpu.shape[0]
job = nerox.optimize.qubo_from_device_ptr(
    ptr=Q_gpu.data_ptr(),   # raw CUDA device pointer -- no host copy
    n=n,
    dtype="float64",        # must match the tensor's dtype
    device=0,               # CUDA device ordinal the tensor lives on
    solver="gpu",
)
result = job.wait()