diff --git a/CMakeLists.txt b/CMakeLists.txt index f7a75ff7b..b70226b43 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,12 +32,12 @@ endif() #set(CMAKE_INSTALL_ALWAYS ON) # Default CLFAGS and CXXFLAGS: -set(CMAKE_C_FLAGS_RELEASE "-O2") -set(CMAKE_CXX_FLAGS_RELEASE "-O2") -set(CMAKE_C_FLAGS_DEBUG "-g") -set(CMAKE_CXX_FLAGS_DEBUG "-g") -set(CMAKE_C_FLAGS_OPTIMISED "-O3") -set(CMAKE_CXX_FLAGS_OPTIMISED "-O3") +set(CMAKE_C_FLAGS_RELEASE "-O2 -fopenmp") +set(CMAKE_CXX_FLAGS_RELEASE "-O2 -fopenmp") +set(CMAKE_C_FLAGS_DEBUG "-g -fopenmp") +set(CMAKE_CXX_FLAGS_DEBUG "-g -fopenmp") +set(CMAKE_C_FLAGS_OPTIMISED "-O3 -fopenmp") +set(CMAKE_CXX_FLAGS_OPTIMISED "-O3 -fopenmp") set(CMAKE_INCLUDE_CURRENT_DIR ON) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") set(CMAKE_CXX_STANDARD 17) @@ -357,10 +357,21 @@ endif() find_file(HAVE_MEMX_H memx.h /usr/include/memx) if(HAVE_MEMX_H) - find_library(MEMX_LIBRARIES mx_accl) + find_library(MEMX_LIBRARIES memx) if(MEMX_LIBRARIES) - set(optlibsfound "${optlibsfound} mx_accl") + set(optlibsfound "${optlibsfound} memx") list(APPEND ZM_BIN_LIBS "${MEMX_LIBRARIES}") + else() + set(optlibsnotfound "${optlibsnotfound} memx") + endif() +endif() + +find_file(HAVE_MX_ACCL_H MxAccl.h /usr/include/memx/accl) +if(HAVE_MX_ACCL_H) + find_library(MX_ACCL_LIBRARIES mx_accl) + if(MX_ACCL_LIBRARIES) + set(optlibsfound "${optlibsfound} mx_accl") + list(APPEND ZM_BIN_LIBS "${MX_ACCL_LIBRARIES}") else() set(optlibsnotfound "${optlibsnotfound} mx_accl") endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 81a9dd9a6..acebfa370 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -37,6 +37,9 @@ set(ZM_BIN_SRC_FILES zm_libvnc_camera.cpp zm_local_camera.cpp zm_memx.cpp + zm_mx_accl.cpp + zm_memx_yolo_core.cpp + zm_memx_yolov8.cpp zm_monitor.cpp zm_monitor_monitorlink.cpp zm_monitor_rtsp2web.cpp diff --git a/src/zm_ai_server.cpp b/src/zm_ai_server.cpp index 86af25d3b..63e75544c 100644 --- 
a/src/zm_ai_server.cpp +++ b/src/zm_ai_server.cpp @@ -87,11 +87,17 @@ void Usage() { Quadra quadra; #endif +#undef HAVE_MEMX_H #if HAVE_MEMX_H #include "zm_memx.h" MemX *memx; #endif +#if HAVE_MX_ACCL_H + #include "zm_mx_accl.h" + MxAccl *mx_accl; +#endif + int main(int argc, char *argv[]) { self = argv[0]; @@ -172,10 +178,12 @@ int main(int argc, char *argv[]) { return 0; } #endif + #if HAVE_MEMX_H Debug(1, "Including MemX"); memx = new MemX(); - if (!memx->setup("yolov5", "/var/cache/zoneminder/models/YOLO_v5_nano_leaky_416_416_3_tensorflow.dfp")) { + if (!memx->setup("yolov8", "/var/cache/zoneminder/models/YOLO_v8_nano_640_640_3_onnx.dfp")) { + //if (!memx->setup("yolov5", "/var/cache/zoneminder/models/YOLO_v5_nano_leaky_416_416_3_tensorflow.dfp")) { delete memx; memx = nullptr; return 0; @@ -183,6 +191,17 @@ int main(int argc, char *argv[]) { #else Debug(1, "Not Including MemX"); #endif +#if HAVE_MX_ACCL_H + Debug(1, "Including MemX_Accl"); + mx_accl = new MxAccl(); + if (!mx_accl->setup("yolov8", "/var/cache/zoneminder/models/YOLO_v8_nano_640_640_3_onnx.dfp")) { + delete mx_accl; + mx_accl = nullptr; + return 0; + } +#else + Debug(1, "Not Including MXACCL"); +#endif std::unordered_map> monitors; std::unordered_map threads; @@ -206,6 +225,9 @@ int main(int argc, char *argv[]) { #endif #if HAVE_MEMX_H , memx +#endif +#if HAVE_MX_ACCL_H + , mx_accl #endif ); } @@ -278,6 +300,13 @@ void AIThread::Inference() { sleep(1); } #endif +#ifdef HAVE_MX_ACCL_H + Debug(1, "Starting for mx_accl"); + while (!terminate_ and !( mx_accl_job = mx_accl->get_job() )) { + Warning("Waiting for job"); + sleep(1); + } +#endif #if HAVE_QUADRA #if !SOFT_DRAWBOX @@ -300,14 +329,13 @@ void AIThread::Inference() { while (!(terminate_ or zm_terminate)) { std::shared_ptr packet = nullptr; - Debug(1, "Getting lock"); + Debug(4, "Getting shm lock"); // Need to hold the lock because it guards shared_mem as well. 
std::unique_lock lck(mutex_); - Debug(1, "Having lock"); + Debug(4, "Having lock"); while (!(send_queue.size() or terminate_)) { - Debug(1, "Waiting"); condition_.wait(lck); - Debug(1, "Send queue size for monitor %d is %zu", monitor_->Id(), send_queue.size()); + Debug(4, "Send queue size for monitor %d is %zu", monitor_->Id(), send_queue.size()); } if (terminate_) break; packet = send_queue.front(); @@ -315,25 +343,21 @@ void AIThread::Inference() { if (packet) { Monitor::SharedData *shared_data = monitor_->getSharedData(); Debug(4, "Sending image %d for monitor %d", packet->image_index, monitor_->Id()); -#ifdef HAVE_UNTETHER_H - speedai->send_image(job, packet->image); Image *ai_image = monitor_->GetAnalysisImage(packet->image_index); - nlohmann::json detections = speedai->receive_detections(job, monitor_->ObjectDetection_Object_Threshold()); - Debug(1, "detections %s", detections.dump().c_str()); - if (detections.size()) { - draw_boxes(packet->image, ai_image, detections, monitor_->LabelSize(), monitor_->LabelSize()); - //draw_boxes(drawbox_filter, drawbox_filter_ctx, packet->image, ai_image, detections, monitor_->LabelSize(), monitor_->LabelSize()); - } else { - ai_image->Assign(*packet->image); - } +#ifdef HAVE_UNTETHER_H + speedai->send_image(job, packet->image); + nlohmann::json detections = speedai->receive_detections(job, monitor_->ObjectDetection_Object_Threshold()); #endif #ifdef HAVE_MEMX_H memx->send_image(memx_job, packet->image); - - Image *ai_image = monitor_->GetAnalysisImage(packet->image_index); nlohmann::json detections = memx->receive_detections(memx_job, monitor_->ObjectDetection_Object_Threshold()); +#endif +#ifdef HAVE_MX_ACCL_H + mx_accl->send_image(mx_accl_job, packet->image); + nlohmann::json detections = mx_accl->receive_detections(mx_accl_job, monitor_->ObjectDetection_Object_Threshold()); +#endif Debug(1, "detections %s", detections.dump().c_str()); if (detections.size()) { @@ -342,7 +366,6 @@ void AIThread::Inference() { } else { 
ai_image->Assign(*packet->image); } -#endif shared_data->last_analysis_index = packet->image_index; @@ -436,7 +459,7 @@ void AIThread::Run() { (send_queue.size() <= static_cast(image_buffer_count)) ) { image_index = shared_data->last_decoder_index % image_buffer_count; - Debug(3, "Doing SpeedAI on monitor %d. Decoder index is %d=%d Our index is %d=%d, queue %zu", + Debug(3, "Doing ai on monitor %d. Decoder index is %d=%d Our index is %d=%d, queue %zu", monitor_->Id(), decoder_image_count, shared_data->last_decoder_index, analysis_image_count, image_index, send_queue.size()); @@ -469,7 +492,7 @@ void AIThread::Run() { } } else { - Debug(4, "Not Doing SpeedAI on monitor %d. Decoder count is %d index %d Our count is %d, last_index is %d, index %d", + Debug(4, "Not Doing ai on monitor %d. Decoder count is %d index %d Our count is %d, last_index is %d, index %d", monitor_->Id(), decoder_image_count, shared_data->last_decoder_index, shared_data->analysis_image_count, shared_data->last_analysis_index, image_index); @@ -651,6 +674,9 @@ AIThread::AIThread(const std::shared_ptr monitor #endif #if HAVE_MEMX_H , MemX *p_memx +#endif +#if HAVE_MX_ACCL_H + , MxAccl *p_mx_accl #endif ) : monitor_(monitor), terminate_(false) @@ -660,6 +686,9 @@ AIThread::AIThread(const std::shared_ptr monitor #if HAVE_MEMX_H , memx(p_memx) #endif +#if HAVE_MX_ACCL_H + , mx_accl(p_mx_accl) +#endif #if HAVE_QUADRA ,drawbox_filter(nullptr) #endif diff --git a/src/zm_ai_server.h b/src/zm_ai_server.h index e258902d9..cb1d992ed 100644 --- a/src/zm_ai_server.h +++ b/src/zm_ai_server.h @@ -24,19 +24,23 @@ extern "C" { #if HAVE_MEMX_H #include "zm_memx.h" #endif +#if HAVE_MX_ACCL_H + #include "zm_mx_accl.h" +#endif class Monitor; class AIThread { public: - explicit AIThread(const std::shared_ptr monitor #if HAVE_UNTETHER_H - , SpeedAI *speedai + explicit AIThread(const std::shared_ptr monitor, SpeedAI *speedai); #endif #if HAVE_MEMX_H - , MemX *memx + explicit AIThread(const std::shared_ptr monitor, MemX 
*memx); +#endif +#if HAVE_MX_ACCL_H + explicit AIThread(const std::shared_ptr monitor, MxAccl *memx); #endif - ); ~AIThread(); AIThread(AIThread &rhs) = delete; AIThread(AIThread &&rhs) = delete; @@ -62,6 +66,10 @@ class AIThread { MemX *memx; MemX::Job *memx_job; #endif +#if HAVE_MX_ACCL_H + MxAccl *mx_accl; + MxAccl::Job *mx_accl_job; +#endif #if HAVE_QUADRA Quadra::filter_worker *drawbox_filter; #endif diff --git a/src/zm_memx.cpp b/src/zm_memx.cpp index 7b1bd0314..9d0db1ad8 100644 --- a/src/zm_memx.cpp +++ b/src/zm_memx.cpp @@ -36,7 +36,7 @@ #define USE_THREAD 1 -#define BATCH_SIZE 10 +#define BATCH_SIZE 1 #ifdef HAVE_MEMX_H static const char *coco_classes[] = {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", @@ -51,14 +51,17 @@ static const char *coco_classes[] = {"person", "bicycle", "car", "motorcycle", " "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"}; MemX::MemX() : - accl(), + model_id(0), + input_port(0), + output_port(0), + obj_thresh(0.25), + nms_thresh(0.45), terminate_(false), batchSize(0), inSize(0), outSize(0), count(0) { - image_size = av_image_get_buffer_size(AV_PIX_FMT_RGB24, MODEL_WIDTH, MODEL_HEIGHT, 32); } MemX::~MemX() { @@ -68,6 +71,7 @@ MemX::~MemX() { delete jobs.front(); jobs.pop_front(); } + memx_close(model_id); } bool MemX::setup( @@ -77,15 +81,35 @@ bool MemX::setup( // Load and launch module_ Debug(1, "MemX: Loading model %s", model_file.c_str()); - std::vector device_ids = {0}; - std::array use_model_shape = {false, false}; + //std::vector device_ids = {0}; + //std::array use_model_shape = {false, false}; + uint8_t model_id = 0; // Create the accelerator object and load the DFP model - accl = new MX::Runtime::MxAccl(filesystem::path(model_file), device_ids, use_model_shape); + // bind MPU device group 0 as MX3 to model 0 + memx_status status = memx_open(model_id, 0, MEMX_DEVICE_CASCADE_PLUS); + if (status == 0) { + Debug(1, "Success opening MX3"); + } else { + 
Error("Failure openping MX3"); + return false; + } - //int dfp_tag = accl.connect_dfp(model_file.c_str()); - model_info = accl->get_model_info(0); - batchSize = 1; // only 1 issupported + // download weight memory and 1st model within DFP file to device + status = memx_download_model(model_id, model_file.c_str(), 0, MEMX_DOWNLOAD_TYPE_WTMEM_AND_MODEL); + + if (status == 0) { + // read back flow 0 (port 0) input feature map shape for debug information + memx_get_ifmap_size(0, 0, &input_height, &input_width, &input_depth, &input_channels, &input_format); + // for example: shape = (224, 224, 1, 3), format = 1 + Debug(1, "ifmap shape = (%d, %d, %d, %d), format = %d", input_height, input_width, input_depth, input_channels, input_format); + memx_get_ofmap_size(0, 0, &output_height, &output_width, &output_depth, &output_channels, &output_format); + Debug(1, "ofmap shape = (%d, %d, %d, %d), format = %d", output_height, output_width, output_depth, output_channels, output_format); + image_size = av_image_get_buffer_size(AV_PIX_FMT_RGB24, input_width, input_height, 0); + } + // Set confidence threshold and process detection results + yolov8_handle = std::make_unique(input_width, input_height, input_channels, 0, 0); + yolov8_handle->SetConfidenceThreshold(obj_thresh); #if USE_THREAD thread_ = std::thread(&MemX::Run, this); #endif @@ -93,40 +117,143 @@ bool MemX::setup( } void MemX::Run() { + + memx_status status = memx_set_stream_enable(model_id, 0); + while (!(terminate_ or zm_terminate)) { - Debug(1, "MemX locking, queue size %zu", send_queue.size()); - std::unique_lock lck(mutex_); - while (send_queue.size() < BATCH_SIZE and ! 
zm_terminate) { - Debug(1, "MemX waiting, queue size %zu", send_queue.size()); - condition_.wait(lck); - } - if (send_queue.size() >= BATCH_SIZE) { - Debug(1, " batch"); - for (int i = BATCH_SIZE; i > 0 ; i--) { - Job *job = send_queue.front(); + Job *job = nullptr; + { + Debug(1, "MemX locking, queue size %zu", send_queue.size()); + std::unique_lock lck(mutex_); + while (send_queue.size() < BATCH_SIZE and ! zm_terminate) { + Debug(1, "MemX waiting, queue size %zu", send_queue.size()); + condition_.wait(lck); + } + if (send_queue.size() >= BATCH_SIZE) { + Debug(1, " batch"); + job = send_queue.front(); send_queue.pop_front(); - std::unique_lock job_lck(job->mutex_); + } else { + Debug(1, "Not batch"); + } + } // end scope for lock - Debug(3, "MemX::enqueue"); - SystemTimePoint starttime = std::chrono::system_clock::now(); - int err = 0; - if (err) { -//Error("Failed enqueue %s", uai_err_string(err)); - } else { - SystemTimePoint endtime = std::chrono::system_clock::now(); - if (endtime - starttime > Milliseconds(40)) { - Warning("MemX enqueue is too slow: %.3f seconds", FPSeconds(endtime - starttime).count()); - } else { - Debug(3, "MemX enqueue took: %.3f seconds", FPSeconds(endtime - starttime).count()); - } - } - job->notify(); - } // end foreach job in batch - } else { - Debug(1, "Not batch"); - } // end if BATCH + if (job) { + job->lock(); + inference(job); + + Debug(1, "notifying"); + job->notify(); + Debug(1, "unlocking"); + job->unlock(); + Debug(1, "Done notifying"); + job = nullptr; + } // end if job } // end while forever + + status = memx_set_stream_disable(model_id, 0); +} + +#if 0 +// Function to process model output and get bounding boxes +nlohmann::json MemX::get_detections(float *ofmap, int num_boxes, const cv::Mat &inframe, + std::vector &filtered_boxes, + std::vector> &filtered_mask_coefs) +{ + + nlohmann::json coco_predictions = nlohmann::json::array(); + + std::vector all_boxes; + std::vector all_scores; + std::vector> mask_coefs; + + // 
Iterate through the model outputs + for (int i = 0; i < num_boxes; ++i) + { + float *feature_ptr = ofmap_t_ptr + i * num_features; + + // get best class information + float confidence; + int class_id; + get_best_class_info(feature_ptr, confidence, class_id); + + if (confidence > conf_thresh) + { + // Extract and scale the bounding box coordinates + float x0 = feature_ptr[0]; + float y0 = feature_ptr[1]; + float w = feature_ptr[2]; + float h = feature_ptr[3]; + // Convert boxes from center format (cxcywh) to corner format (xyxy) + int x1 = static_cast(x0 - 0.5f * w); + int y1 = static_cast(y0 - 0.5f * h); + int x2 = x1 + w; + int y2 = y1 + h; + + // Rescale boxes to original image size + x1 = (x1 - pad_w) / scale_ratio; + x2 = (x2 - pad_w) / scale_ratio; + y1 = (y1 - pad_h) / scale_ratio; + y2 = (y2 - pad_h) / scale_ratio; + + // Clamp box boundaries to image dimensions + x1 = std::max(0, std::min(x1, ori_image_width - 1)); + x2 = std::max(0, std::min(x2, ori_image_width - 1)); + y1 = std::max(0, std::min(y1, ori_image_height - 1)); + y2 = std::max(0, std::min(y2, ori_image_height - 1)); + + // Add detected box to the list + all_boxes.emplace_back(x1, y1, x2, y2, class_id, confidence); + all_scores.emplace_back(confidence); + cv_boxes.emplace_back(x1, y1, x2 - x1, y2 - y1); + + // Add detected mask coefficient to the list + std::vector mask_coef(feature_ptr + 4 + num_class, feature_ptr + num_features); + mask_coefs.push_back(mask_coef); + } + } + o // Apply Non-Maximum Suppression (NMS) to remove overlapping boxes + std::vector nms_result; + if (!cv_boxes.empty()) { + cv::dnn::NMSBoxes(cv_boxes, all_scores, conf_thresh, nms_thresh, nms_result); + for (int idx : nms_result) + { + filtered_boxes.push_back(all_boxes[idx]); + filtered_mask_coefs.push_back(mask_coefs[idx]); + } + } +} + +#endif + + + +memx_status MemX::inference(Job *job) { + const int timeout = 200; // 200 ms + Debug(3, "MemX::enqueue"); + SystemTimePoint starttime = std::chrono::system_clock::now(); + 
memx_status status = memx_stream_ifmap(model_id, input_port, static_cast(job->ifmap.data()), timeout); + + if (status) { + Error("Failed enqueue %d", status); + return status; + } else { + SystemTimePoint endtime = std::chrono::system_clock::now(); + if (endtime - starttime > Milliseconds(40)) { + Warning("MemX enqueue is too slow: %.3f seconds", FPSeconds(endtime - starttime).count()); + } else { + Debug(3, "MemX enqueue took: %.3f seconds", FPSeconds(endtime - starttime).count()); + } + } + + // 3. Read output feature map from device after inference + if (0 == status) { + Debug(1, "Reading ofmap"); + status = memx_stream_ofmap(model_id, output_port, static_cast(job->ofmap.data()), timeout); + Debug(1, "Status from reading ofmap %d", status); + } + return status; } MemX::Job * MemX::send_image(Job *job, Image *image) { @@ -145,26 +272,65 @@ MemX::Job * MemX::send_packet(Job *job, std::shared_ptr packet) { } MemX::Job * MemX::get_job() { - Job *job = new Job(accl); + Job *job = new Job(); + AVFrame *frame = job->scaled_frame.get(); + + job->scaled_frame->width = input_width; + job->scaled_frame->height = input_height; + job->scaled_frame->format = AV_PIX_FMT_RGB24; + + Debug(1, "sw scale: scaled frame data 0x%lx, linesize %d/%d/%d/%d, height %d", + (unsigned long)frame->data, frame->linesize[0], frame->linesize[1], + frame->linesize[2], frame->linesize[3], frame->height); + +#if 1 if (av_frame_get_buffer(job->scaled_frame.get(), 32)) { Error("cannot allocate scaled frame buffer"); return nullptr; } +#endif + + job->ifmap.resize(input_width*input_width*input_depth*input_channels); + job->ofmap.resize(output_width*output_width*output_depth*output_channels); + +#if 0 + AVBufferRef *ref = av_buffer_create(reinterpret_cast(&job->ifmap), + input_width*input_width*input_depth*input_channels, + dont_free, /* Free callback */ + nullptr, /* opaque */ + 0 /* flags */ + ); + if (!ref) { + Warning("Failed to create av_buffer"); + } + job->scaled_frame->buf[0] = ref; + + int 
rc_size = av_image_fill_arrays( + job->scaled_frame->data, job->scaled_frame->linesize, + reinterpret_cast(&job->ifmap), AV_PIX_FMT_RGB24, input_width, input_height, + 0 //alignment + ); + if (rc_size < 0) { + Error("Problem setting up data pointers into image %s", av_make_error_string(rc_size).c_str()); + } else { + Debug(1, "Image.Populate frame rc/size %d %dx%d", rc_size, job->scaled_frame->width, job->scaled_frame->height); + } +#endif std::unique_lock lck(mutex_); - jobs.push_back(job); return job; -} +} // end MemX::Job * MemX::get_job() MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) { count++; Debug(1, "MemX::send_frame %d", count); SystemTimePoint starttime = std::chrono::system_clock::now(); + job->sw_scale_ctx = sws_getCachedContext(job->sw_scale_ctx, avframe->width, avframe->height, static_cast(avframe->format), - MODEL_WIDTH, MODEL_HEIGHT, AV_PIX_FMT_RGB24, + input_width, input_height, AV_PIX_FMT_RGB24, SWS_BICUBIC, nullptr, nullptr, nullptr); //Debug(1, "Start scale"); @@ -176,8 +342,53 @@ MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) { avframe->linesize[2], avframe->linesize[3], avframe->height, job->scaled_frame->linesize[0]); return nullptr; } - job->m_width_rescale = ((float)MODEL_WIDTH / (float)avframe->width); - job->m_height_rescale = ((float)MODEL_HEIGHT / (float)avframe->height); + job->m_width_rescale = ((float)input_width / (float)avframe->width); + job->m_height_rescale = ((float)input_height / (float)avframe->height); + + std::vector accl_input_data; + accl_input_data.resize(input_width*input_height*input_depth*input_channels); + +#if 0 + yolov8_handle->PreProcess(job->scaled_frame->data[0], input_width, input_height, accl_input_data); + +#else + int offset = 0; //padding_height_ * input_width_ * 3; + float *buffer_ptr = job->ifmap.data() + offset; // YOLOv8 has 1 input + uint8_t *input_buffer_ptr = job->scaled_frame->data[0]; + + for (int row = 0; row < input_height; ++row) { + for (int col = 0; col < input_width; 
++col) { +#pragma omp simd + for(int i=0; i < 3; i++) { + buffer_ptr[0] = input_buffer_ptr[0] / (float)255.0; // red + buffer_ptr[1] = input_buffer_ptr[1] / (float)255.0; // green + buffer_ptr[2] = input_buffer_ptr[2] / (float)255.0; // blue + } + buffer_ptr += 3; + input_buffer_ptr += 3; + } + } +#endif + +#if 1 + inference(job); + YOLOv8Result result; + //yolov8_handle->PostProcess(job->ofmap, result); +#else + // Prevents processing? + job->lock(); + //std::unique_lock job_lck(job->mutex_); + { + Debug(1, "Locking"); + std::unique_lock lck(mutex_); + send_queue.push_back(job); + condition_.notify_all(); + } + Debug(1, "Waiting for inference"); + job->wait(); + Debug(1, "Done Waiting"); +#endif + starttime = std::chrono::system_clock::now(); SystemTimePoint endtime = std::chrono::system_clock::now(); if (endtime - starttime > Milliseconds(60)) { @@ -190,10 +401,11 @@ MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) { const nlohmann::json MemX::receive_detections(Job *job, float object_threshold) { nlohmann::json coco_object; - coco_object = convert_predictions_to_coco_format(job->predictions_buffer, job->m_width_rescale, job->m_height_rescale, object_threshold); + coco_object = convert_predictions_to_coco_format(job->ofmap, job->m_width_rescale, job->m_height_rescale, object_threshold); return coco_object; } +#if 0 nlohmann::json MemX::convert_predictions_to_coco_format( const std::vector& predictions, float m_width_rescale, @@ -238,5 +450,6 @@ nlohmann::json MemX::convert_predictions_to_coco_format( } return coco_predictions; } +#endif #endif diff --git a/src/zm_memx.h b/src/zm_memx.h index fbb6912f2..5598480b8 100644 --- a/src/zm_memx.h +++ b/src/zm_memx.h @@ -20,19 +20,22 @@ extern "C" { } #ifdef HAVE_MEMX_H -//#include "memx/memx.h" -#include - -// FIXME move model stuff to it's ownfile -#define MODEL_WIDTH 640 -#define MODEL_HEIGHT 640 -// * 6; // 256 boxes, each with 6 elements [l, t, r, b, class, score] -#define NUM_NMS_PREDICTIONS 256 +#include 
"memx/memx.h" +#include "zm_memx_yolov8.h" class MemX { private: - MX::Runtime::MxAccl *accl; - MX::Types::MxModelInfo model_info; + std::unique_ptr yolov8_handle; + + uint8_t model_id; + uint8_t input_port; + uint8_t output_port; + + float obj_thresh = 0.25; + float nms_thresh = 0.45; + + int32_t input_height, input_width, input_depth, input_channels, input_format; + int32_t output_height, output_width, output_depth, output_channels, output_format; std::mutex mutex_; std::condition_variable condition_; @@ -50,7 +53,6 @@ class MemX { public: class Job { private: - MX::Runtime::MxAccl *m_accl; public: int index; @@ -58,13 +60,15 @@ class MemX { float m_width_rescale; float m_height_rescale; SwsContext *sw_scale_ctx; - std::vector predictions_buffer; + + std::vector ifmap; + std::vector ofmap; + std::mutex mutex_; std::unique_lock lck_; std::condition_variable condition_; - Job(MX::Runtime::MxAccl *p_accl) : - m_accl(p_accl), + Job() : index(0), m_width_rescale(1.0), m_height_rescale(1.0), @@ -72,10 +76,6 @@ class MemX { lck_(mutex_, std::defer_lock) { scaled_frame = av_frame_ptr(zm_av_frame_alloc()); - scaled_frame->width = MODEL_WIDTH; - scaled_frame->height = MODEL_HEIGHT; - scaled_frame->format = AV_PIX_FMT_RGB24; - predictions_buffer.resize(NUM_NMS_PREDICTIONS*6); }; ~Job() { if (sw_scale_ctx) { @@ -85,14 +85,12 @@ class MemX { void setFrame(AVFrame *frame) { } Job(Job &&in) : - m_accl(in.m_accl), index(in.index), scaled_frame(std::move(in.scaled_frame)), m_width_rescale(in.m_width_rescale), m_height_rescale(in.m_height_rescale) { Debug(1, "In move"); - //in.scaled_frame = nullptr; } void lock() { lck_.lock(); @@ -117,6 +115,7 @@ class MemX { Job * send_packet(Job *job, std::shared_ptr); Job * send_image(Job *job, Image *image); Job * send_frame(Job *job, AVFrame *); + memx_status inference(Job *job); const nlohmann::json receive_detections(Job *job, float threshold); nlohmann::json convert_predictions_to_coco_format(const std::vector& predictions, float, float, 
float threshold); diff --git a/src/zm_memx_accl_yolov8.cpp b/src/zm_memx_accl_yolov8.cpp new file mode 100644 index 000000000..2b836a2b6 --- /dev/null +++ b/src/zm_memx_accl_yolov8.cpp @@ -0,0 +1,313 @@ +#include "zm_memx_yolov8.h" +#include "zm_logger.h" +#include + +YOLOv8::YOLOv8(size_t input_width, size_t input_height, size_t input_channel, float confidence_thresh, float iou_thresh) { + Init(input_width, input_height, input_channel, confidence_thresh, iou_thresh, COCO_CLASS_NUMBER, COCO_NAMES); +} + +YOLOv8::YOLOv8(size_t input_width, size_t input_height, size_t input_channel, + float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels) +{ + Init(input_width, input_height, input_channel, confidence_thresh, iou_thresh, class_count, class_labels); +} + +void YOLOv8::Init(size_t accl_input_width, size_t accl_input_height, size_t accl_input_channel, + float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels) +{ + if (confidence_thresh < 0.0f || confidence_thresh > 1.0f || iou_thresh < 0.0f || iou_thresh > 1.0f) { + throw std::invalid_argument("Confidence and IOU threshold must be between 0.0 and 1.0."); + } + + if (accl_input_width != accl_input_height) { + throw std::invalid_argument("The YOLOv8 api requires model input to have equal width and height dimensions." 
+ "Currently, inconsistent input dimensions are not supported."); + } + + accl_input_width_ = accl_input_width; + accl_input_height_ = accl_input_height; + accl_input_channel_ = accl_input_channel; + confidence_thresh_ = confidence_thresh; + iou_thresh_ = iou_thresh; + class_labels_ = class_labels; + class_count_ = class_count; + valid_input_ = false; + + yolo_post_layers_[0] = { + .coordinate_ofmap_flow_id = 0, + .confidence_ofmap_flow_id = 1, + .width = accl_input_width_ / 8, // L0_HW, 640 / 8 = 80 + .height = accl_input_height_ / 8, // L0_HW, 640 / 8 = 80 + .ratio = 8, + .coordinate_fmap_size = 64, + }; + + yolo_post_layers_[1] = { + .coordinate_ofmap_flow_id = 2, + .confidence_ofmap_flow_id = 3, + .width = accl_input_width_ / 16, // L1_HW, 640 / 16 = 40 + .height = accl_input_height_ / 16, // L1_HW, 640 / 16 = 40 + .ratio = 16, + .coordinate_fmap_size = 64, + }; + + yolo_post_layers_[2] = { + .coordinate_ofmap_flow_id = 4, + .confidence_ofmap_flow_id = 5, + .width = accl_input_width_ / 32, // L2_HW, 640 / 32 = 20 + .height = accl_input_height_ / 32, // L2_HW, 640 / 32 = 20 + .ratio = 32, + .coordinate_fmap_size = 64, + }; +} + +YOLOv8::~YOLOv8() +{ +} + +float YOLOv8::GetConfidenceThreshold() { + std::lock_guard guard(confidence_mutex_); + return confidence_thresh_; +} + +void YOLOv8::SetConfidenceThreshold(float confidence) { + if (confidence < 0.0f || confidence > 1.0f) { + throw std::invalid_argument("Confidence threshold must be between 0.0 and 1.0."); + } + + std::lock_guard guard(confidence_mutex_); + confidence_thresh_ = confidence; +} + +bool YOLOv8::IsHorizontalInput(int disp_width, int disp_height) { + if (disp_height > disp_width) { + Error("Invalid display image: only horizontal images are supported."); + return false; + } + return true; +} + +void YOLOv8::ComputePadding(int disp_width, int disp_height) { + if (!IsHorizontalInput(disp_width, disp_height)) + return; + + // accl_input_height_ is equal to accl_input_width_ + letterbox_ratio_ = 
(float)accl_input_height_ / mxutil_max(disp_width, disp_height); + + letterbox_width_ = disp_width * letterbox_ratio_; + letterbox_height_ = disp_height * letterbox_ratio_; + + padding_width_ = (accl_input_width_ - letterbox_width_) / 2; + padding_height_ = (accl_input_height_ - letterbox_height_) / 2; + + valid_input_ = true; +} + +void YOLOv8::PreProcess(uint8_t *rgb_data, int image_width, int image_height, std::vector input_buffers) { + if (!valid_input_) { + throw std::runtime_error("Make sure to call ComputePadding() before further processing."); + } + + int offset = padding_height_ * accl_input_width_ * 3; + float *buffer_ptr = input_buffers[0] + offset; // YOLOv8 has 1 input + +#if 0 + cv::Mat src_img = cv::Mat(image_height, image_width, CV_8UC3, rgb_data); + cv::Mat resized_img; + cv::resize(src_img, resized_img, cv::Size(letterbox_width_, letterbox_height_), 0, 0, cv::INTER_LINEAR); + + + for (int row = 0; row < resized_img.rows; ++row) { + const cv::Vec3b *pixel_row = resized_img.ptr(row); + + for (int col = 0; col < resized_img.cols; ++col) { + const cv::Vec3b &pixelValue = pixel_row[col]; + +#pragma omp simd + for(int i=0; i < 3; i++) { + buffer_ptr[0] = pixelValue[0] / (float)255.0; // red + buffer_ptr[1] = pixelValue[1] / (float)255.0; // green + buffer_ptr[2] = pixelValue[2] / (float)255.0; // blue + } + buffer_ptr += 3; + } + } +#else + + uint8_t *input_buffer_ptr = rgb_data; + + for (int row = 0; row < image_height; ++row) { + for (int col = 0; col < image_width; ++col) { +#pragma omp simd + for(int i=0; i < 3; i++) { + buffer_ptr[0] = input_buffer_ptr[0] / (float)255.0; // red + buffer_ptr[1] = input_buffer_ptr[1] / (float)255.0; // green + buffer_ptr[2] = input_buffer_ptr[2] / (float)255.0; // blue + } + buffer_ptr += 3; + input_buffer_ptr += 3; + } + } +#endif +} + + +void YOLOv8::GetDetection( + std::queue &bboxes, + int layer_id, + float *confidence_cell_buf, + float *coordinate_cell_buf, + int row, int col, float *confs_tmp) +{ + // process 
confidence score + float best_label_score = -1.0; + int best_label; + + // Force use of SIMD to calculate sigmoid. + // + // The space tradeoff of the temp result buffer + // is more than worth the speedup / CPU usage reduction + // (~75% CPU [down from 90% due to other changes] down to 59% CPU on my laptop) +#pragma omp simd + for (size_t label = 0; label < class_count_; label++) { + confs_tmp[label] = mxutil_prepost_sigmoid(confidence_cell_buf[label]); + } + // no way to avoid O(n) here, though + for (size_t label = 0; label < class_count_; label++) { + if (confs_tmp[label] > best_label_score) { + best_label_score = confs_tmp[label]; + best_label = label; + } + } + + if (best_label_score < confidence_thresh_) + return; + + std::vector feature_value{}; + + for (int channel = 0; channel < 4; channel++) { // split 64 into 4*16 + float value = 0.0; + float *feature_buf = coordinate_cell_buf + channel * 16; + float softmax_sum = 0.0; + float local_max = feature_buf[0]; + + // apply softmax and weighted sum + for (int i = 1; i < 16; i++) { + if (feature_buf[i] > local_max) + local_max = feature_buf[i]; + } + + // more SIMD hints +#pragma omp simd reduction(+:softmax_sum) + for (int i = 0; i < 16; i++) { + softmax_sum += expf(feature_buf[i] - local_max); + } + + // more SIMD hints +#pragma omp simd reduction(+:value) + for (int i = 0; i < 16; i++) { + value += ((float)i * (float)(expf(feature_buf[i] - local_max) / softmax_sum)); + } + feature_value.push_back(value); + } + + float center_x, center_y, w, h; + + center_x = (feature_value[2] - feature_value[0] + 2 * (0.5 + ((float)col))) * 0.5 * yolo_post_layers_[layer_id].ratio; + center_y = (feature_value[3] - feature_value[1] + 2 * (0.5 + ((float)row))) * 0.5 * yolo_post_layers_[layer_id].ratio; + w = (feature_value[2] + feature_value[0]) * yolo_post_layers_[layer_id].ratio; + h = (feature_value[3] + feature_value[1]) * yolo_post_layers_[layer_id].ratio; + + // printf("[Layer%d] (%0.3f) %s\t: 
%.2f\t,%.2f\t,%.2f\t,%.2f\t]\n", layer_id, best_label_score, class_labels_[best_label], center_x, center_y, w, h); + + float min_x = mxutil_max(center_x - 0.5 * w, .0); + float min_y = mxutil_max(center_y - 0.5 * h, .0); + float max_x = mxutil_min(center_x + 0.5 * w, accl_input_width_); + float max_y = mxutil_min(center_y + 0.5 * h, accl_input_height_); + + BBox bbox(best_label, best_label_score, min_x, min_y, max_x, max_y); + + non_maximum_suppression(bboxes, bbox, iou_thresh_); +} + +void YOLOv8::PostProcess(std::vector output_buffers, YOLOv8Result &result) { + if (!valid_input_) { + throw std::runtime_error("Make sure to call ComputePadding() before further processing."); + } + + if (output_buffers.empty()) { + throw std::invalid_argument("output_buffers cannot be null."); + } + + float *confs_tmp = new float[class_count_]; + + for (size_t layer_id = 0; layer_id < kNumPostProcessLayers; ++layer_id) { + const auto &layer = yolo_post_layers_[layer_id]; + const int confidence_floats_per_row = (layer.width * class_count_); // 80 x 80 + const int coordinate_floats_per_row = (layer.width * layer.coordinate_fmap_size); // 80 x 64 + + const int conf_id = layer.confidence_ofmap_flow_id; + const int coord_id = layer.coordinate_ofmap_flow_id; + + float *confidence_base = output_buffers[conf_id]; + float *coordinate_base = output_buffers[coord_id]; + + if (!confidence_base || !coordinate_base) { + throw std::invalid_argument("One or more output buffers are null."); + } + + for (size_t row = 0; row < layer.height; row++) { + float *confidence_row_buf = confidence_base + row * confidence_floats_per_row; + float *coordinate_row_buf = coordinate_base + row * coordinate_floats_per_row; + + for (size_t col = 0; col < layer.width; col++) { + float *confidence_cell_buf = confidence_row_buf + col * class_count_; + float *coordinate_cell_buf = coordinate_row_buf + col * layer.coordinate_fmap_size; + GetDetection(result.bboxes, layer_id, confidence_cell_buf, coordinate_cell_buf, row, 
+                     col, confs_tmp);
+      }
+    }
+  }
+
+  delete [] confs_tmp;
+}
+
+// Convert the four decoded distances (left, top, right, bottom) of a cell
+// into a center/size box in accelerator-input pixels.
+// NOTE(review): "&center_x"/"&center_y" and "static_cast<float>" reconstructed
+// — "&cent" was rendered as "¢" and template arguments were stripped when
+// this patch was HTML-mangled.
+void YOLOv8::CalculateBboxParams(const float *feature_values, int layer_id, int row, int col,
+                                 float &center_x, float &center_y, float &box_width, float &box_height)
+{
+  const float ratio = yolo_post_layers_[layer_id].ratio;
+
+  center_x = (feature_values[2] - feature_values[0] + 2.0f * (0.5f + static_cast<float>(col))) * 0.5f * ratio;
+  center_y = (feature_values[3] - feature_values[1] + 2.0f * (0.5f + static_cast<float>(row))) * 0.5f * ratio;
+  box_width = (feature_values[2] + feature_values[0]) * ratio;
+  box_height = (feature_values[3] + feature_values[1]) * ratio;
+}
+
+// DFL-style decode: softmax over each 16-bin channel, then the expectation
+// (index weighted by softmax probability) is the channel's distance value.
+void YOLOv8::GetFeatureValues(const float *coordinate_buffer, float *feature_values)
+{
+  constexpr int kNumChannels = 4;
+  constexpr int kChannelSize = 16;
+
+  for (int channel = 0; channel < kNumChannels; ++channel) {
+    const float *feature_buf = coordinate_buffer + channel * kChannelSize;
+    float local_max = feature_buf[0];
+
+    // Find local max (subtracted below for numerically stable softmax)
+    for (int i = 1; i < kChannelSize; ++i) {
+      if (feature_buf[i] > local_max) {
+        local_max = feature_buf[i];
+      }
+    }
+
+    // Compute softmax and weighted sum in a single pass
+    float softmax_sum = 0.0f;
+    float value = 0.0f;
+    float temp[kChannelSize];
+    for (int i = 0; i < kChannelSize; ++i) {
+      temp[i] = feature_buf[i] - local_max;
+      float exp_val = expf(temp[i]);
+      softmax_sum += exp_val;
+      value += static_cast<float>(i) * exp_val;
+    }
+    feature_values[channel] = value / softmax_sum;
+  }
+}
diff --git a/src/zm_memx_accl_yolov8.h b/src/zm_memx_accl_yolov8.h
new file mode 100644
index 000000000..66a9c3fec
--- /dev/null
+++ b/src/zm_memx_accl_yolov8.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include "zm_memx_yolo_core.h"
+// NOTE(review): the two include targets below were stripped by HTML mangling;
+// <mutex> (std::mutex member) and <queue> (std::queue parameter) are the
+// names this header uses directly — confirm against upstream.
+#include <mutex>
+#include <queue>
+
+class YOLOv8 {
+public:
+  /** @brief Constructor for using official 80 classes COCO dataset. */
+  YOLOv8(size_t input_width, size_t input_height, size_t input_channel,
+         float confidence_thresh, float iou_thresh);
+
+  /** @brief Constructor for using a customized dataset.
*/
+  YOLOv8(size_t input_width, size_t input_height, size_t input_channel,
+         float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels);
+
+  ~YOLOv8();
+
+  /**
+   * @brief Pre-process the input image data for YOLOv8 model inference.
+   * @param rgb_data Pointer to the input image data in RGB format.
+   * @param image_width Width of the display image.
+   * @param image_height Height of the display image.
+   * @param input_buffers Vector of pointers to the input buffers used by the accelerator for processing. Should be zero-initialized.
+   */
+  void PreProcess(uint8_t *rgb_data, int image_width, int image_height, std::vector<float*> input_buffers);
+
+  /**
+   * @brief Post-process the output data from the YOLOv8 model.
+   * @param output_buffers Vector of pointers to the output buffers from the accelerator.
+   * @param result Reference to the structure where the decoded bounding box results will be stored.
+   */
+  void PostProcess(std::vector<float*> output_buffers, YOLOv8Result &result);
+
+  /** @brief Clear the detection results. */
+  void ClearDetectionResults(YOLOv8Result &result);
+
+  /** @brief Setters and getters for the confidence threshold. */
+  void SetConfidenceThreshold(float confidence);
+  float GetConfidenceThreshold();
+
+  /** @brief Compute padding values for letterboxing from the display image. */
+  void ComputePadding(int disp_width, int disp_height);
+
+  /** @brief Ensure the input dimensions are valid for horizontal display images only. */
+  bool IsHorizontalInput(int disp_width, int disp_height);
+
+private:
+  /** @brief Structure representing per-layer information of YOLOv8 output. */
+  struct LayerParams
+  {
+    uint8_t coordinate_ofmap_flow_id;
+    uint8_t confidence_ofmap_flow_id;
+    size_t width;
+    size_t height;
+    size_t ratio;
+    size_t coordinate_fmap_size;
+  };
+
+  /** @brief Initialization method to set up YOLOv8 model parameters.
*/
+  void Init(size_t accl_input_width, size_t accl_input_height, size_t accl_input_channel,
+            float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels);
+
+  /** @brief Helper methods for building detections from model output. */
+  // NOTE(review): "std::queue<BBox>" reconstructed — the template argument
+  // was stripped by HTML mangling; the .cpp pushes BBox into this queue.
+  void GetDetection(std::queue<BBox> &bounding_boxes, int layer_id, float *confidence_buffer,
+                    float *coordinate_buffer, int row, int col, float *confs_tmp);
+
+  /** @brief Helper methods for calculating bounding box parameters from feature values. */
+  void CalculateBboxParams(const float *feature_values, int layer_id, int row, int col,
+                           float &center_x, float &center_y, float &box_width, float &box_height);
+  void GetFeatureValues(const float *coordinate_buffer, float *feature_values);
+
+  static constexpr size_t kNumPostProcessLayers = 3;
+  struct LayerParams yolo_post_layers_[kNumPostProcessLayers];
+
+  // Model-specific parameters.
+  const char **class_labels_;
+  size_t class_count_;
+  size_t accl_input_width_;   // Input width to accelerator, obtained by dfp.
+  size_t accl_input_height_;  // Input height to accelerator, obtained by dfp.
+  size_t accl_input_channel_; // Input channel to accelerator, obtained by dfp.
+
+  // Confidence and IOU thresholds.
+  std::mutex confidence_mutex_;
+  float confidence_thresh_;
+  float iou_thresh_;
+
+  // Letterbox ratio and padding.
+  float letterbox_ratio_;
+  int letterbox_width_;
+  int letterbox_height_;
+  int padding_height_;
+  int padding_width_;
+  bool valid_input_;
+};
diff --git a/zoneminder-config.cmake b/zoneminder-config.cmake
index 82cfccdd9..84be278d9 100644
--- a/zoneminder-config.cmake
+++ b/zoneminder-config.cmake
@@ -10,6 +10,7 @@
 #cmakedefine HAVE_EXECINFO_H 1
 #cmakedefine HAVE_UNTETHER_H 1
 #cmakedefine HAVE_MEMX_H 1
+#cmakedefine HAVE_MX_ACCL_H 1
 #cmakedefine HAVE_QUADRA 1
 #cmakedefine HAVE_UCONTEXT_H 1
 #cmakedefine HAVE_SYS_SENDFILE_H 1