diff --git a/CMakeLists.txt b/CMakeLists.txt index f7a75ff7b..b70226b43 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,12 +32,12 @@ endif() #set(CMAKE_INSTALL_ALWAYS ON) # Default CLFAGS and CXXFLAGS: -set(CMAKE_C_FLAGS_RELEASE "-O2") -set(CMAKE_CXX_FLAGS_RELEASE "-O2") -set(CMAKE_C_FLAGS_DEBUG "-g") -set(CMAKE_CXX_FLAGS_DEBUG "-g") -set(CMAKE_C_FLAGS_OPTIMISED "-O3") -set(CMAKE_CXX_FLAGS_OPTIMISED "-O3") +set(CMAKE_C_FLAGS_RELEASE "-O2 -fopenmp") +set(CMAKE_CXX_FLAGS_RELEASE "-O2 -fopenmp") +set(CMAKE_C_FLAGS_DEBUG "-g -fopenmp") +set(CMAKE_CXX_FLAGS_DEBUG "-g -fopenmp") +set(CMAKE_C_FLAGS_OPTIMISED "-O3 -fopenmp") +set(CMAKE_CXX_FLAGS_OPTIMISED "-O3 -fopenmp") set(CMAKE_INCLUDE_CURRENT_DIR ON) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") set(CMAKE_CXX_STANDARD 17) @@ -357,10 +357,21 @@ endif() find_file(HAVE_MEMX_H memx.h /usr/include/memx) if(HAVE_MEMX_H) - find_library(MEMX_LIBRARIES mx_accl) + find_library(MEMX_LIBRARIES memx) if(MEMX_LIBRARIES) - set(optlibsfound "${optlibsfound} mx_accl") + set(optlibsfound "${optlibsfound} memx") list(APPEND ZM_BIN_LIBS "${MEMX_LIBRARIES}") + else() + set(optlibsnotfound "${optlibsnotfound} memx") + endif() +endif() + +find_file(HAVE_MX_ACCL_H MxAccl.h /usr/include/memx/accl) +if(HAVE_MX_ACCL_H) + find_library(MX_ACCL_LIBRARIES mx_accl) + if(MX_ACCL_LIBRARIES) + set(optlibsfound "${optlibsfound} mx_accl") + list(APPEND ZM_BIN_LIBS "${MX_ACCL_LIBRARIES}") else() set(optlibsnotfound "${optlibsnotfound} mx_accl") endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 81a9dd9a6..acebfa370 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -37,6 +37,9 @@ set(ZM_BIN_SRC_FILES zm_libvnc_camera.cpp zm_local_camera.cpp zm_memx.cpp + zm_mx_accl.cpp + zm_memx_yolo_core.cpp + zm_memx_yolov8.cpp zm_monitor.cpp zm_monitor_monitorlink.cpp zm_monitor_rtsp2web.cpp diff --git a/src/zm_ai_server.cpp b/src/zm_ai_server.cpp index 86af25d3b..63e75544c 100644 --- 
a/src/zm_ai_server.cpp +++ b/src/zm_ai_server.cpp @@ -87,11 +87,17 @@ void Usage() { Quadra quadra; #endif +#undef HAVE_MEMX_H #if HAVE_MEMX_H #include "zm_memx.h" MemX *memx; #endif +#if HAVE_MX_ACCL_H + #include "zm_mx_accl.h" + MxAccl *mx_accl; +#endif + int main(int argc, char *argv[]) { self = argv[0]; @@ -172,10 +178,12 @@ int main(int argc, char *argv[]) { return 0; } #endif + #if HAVE_MEMX_H Debug(1, "Including MemX"); memx = new MemX(); - if (!memx->setup("yolov5", "/var/cache/zoneminder/models/YOLO_v5_nano_leaky_416_416_3_tensorflow.dfp")) { + if (!memx->setup("yolov8", "/var/cache/zoneminder/models/YOLO_v8_nano_640_640_3_onnx.dfp")) { + //if (!memx->setup("yolov5", "/var/cache/zoneminder/models/YOLO_v5_nano_leaky_416_416_3_tensorflow.dfp")) { delete memx; memx = nullptr; return 0; @@ -183,6 +191,17 @@ int main(int argc, char *argv[]) { #else Debug(1, "Not Including MemX"); #endif +#if HAVE_MX_ACCL_H + Debug(1, "Including MemX_Accl"); + mx_accl = new MxAccl(); + if (!mx_accl->setup("yolov8", "/var/cache/zoneminder/models/YOLO_v8_nano_640_640_3_onnx.dfp")) { + delete mx_accl; + mx_accl = nullptr; + return 0; + } +#else + Debug(1, "Not Including MXACCL"); +#endif std::unordered_map> monitors; std::unordered_map threads; @@ -206,6 +225,9 @@ int main(int argc, char *argv[]) { #endif #if HAVE_MEMX_H , memx +#endif +#if HAVE_MX_ACCL_H + , mx_accl #endif ); } @@ -278,6 +300,13 @@ void AIThread::Inference() { sleep(1); } #endif +#ifdef HAVE_MX_ACCL_H + Debug(1, "Starting for mx_accl"); + while (!terminate_ and !( mx_accl_job = mx_accl->get_job() )) { + Warning("Waiting for job"); + sleep(1); + } +#endif #if HAVE_QUADRA #if !SOFT_DRAWBOX @@ -300,14 +329,13 @@ void AIThread::Inference() { while (!(terminate_ or zm_terminate)) { std::shared_ptr packet = nullptr; - Debug(1, "Getting lock"); + Debug(4, "Getting shm lock"); // Need to hold the lock because it guards shared_mem as well. 
std::unique_lock lck(mutex_); - Debug(1, "Having lock"); + Debug(4, "Having lock"); while (!(send_queue.size() or terminate_)) { - Debug(1, "Waiting"); condition_.wait(lck); - Debug(1, "Send queue size for monitor %d is %zu", monitor_->Id(), send_queue.size()); + Debug(4, "Send queue size for monitor %d is %zu", monitor_->Id(), send_queue.size()); } if (terminate_) break; packet = send_queue.front(); @@ -315,25 +343,21 @@ void AIThread::Inference() { if (packet) { Monitor::SharedData *shared_data = monitor_->getSharedData(); Debug(4, "Sending image %d for monitor %d", packet->image_index, monitor_->Id()); -#ifdef HAVE_UNTETHER_H - speedai->send_image(job, packet->image); Image *ai_image = monitor_->GetAnalysisImage(packet->image_index); - nlohmann::json detections = speedai->receive_detections(job, monitor_->ObjectDetection_Object_Threshold()); - Debug(1, "detections %s", detections.dump().c_str()); - if (detections.size()) { - draw_boxes(packet->image, ai_image, detections, monitor_->LabelSize(), monitor_->LabelSize()); - //draw_boxes(drawbox_filter, drawbox_filter_ctx, packet->image, ai_image, detections, monitor_->LabelSize(), monitor_->LabelSize()); - } else { - ai_image->Assign(*packet->image); - } +#ifdef HAVE_UNTETHER_H + speedai->send_image(job, packet->image); + nlohmann::json detections = speedai->receive_detections(job, monitor_->ObjectDetection_Object_Threshold()); #endif #ifdef HAVE_MEMX_H memx->send_image(memx_job, packet->image); - - Image *ai_image = monitor_->GetAnalysisImage(packet->image_index); nlohmann::json detections = memx->receive_detections(memx_job, monitor_->ObjectDetection_Object_Threshold()); +#endif +#ifdef HAVE_MX_ACCL_H + mx_accl->send_image(mx_accl_job, packet->image); + nlohmann::json detections = mx_accl->receive_detections(mx_accl_job, monitor_->ObjectDetection_Object_Threshold()); +#endif Debug(1, "detections %s", detections.dump().c_str()); if (detections.size()) { @@ -342,7 +366,6 @@ void AIThread::Inference() { } else { 
ai_image->Assign(*packet->image); } -#endif shared_data->last_analysis_index = packet->image_index; @@ -436,7 +459,7 @@ void AIThread::Run() { (send_queue.size() <= static_cast(image_buffer_count)) ) { image_index = shared_data->last_decoder_index % image_buffer_count; - Debug(3, "Doing SpeedAI on monitor %d. Decoder index is %d=%d Our index is %d=%d, queue %zu", + Debug(3, "Doing ai on monitor %d. Decoder index is %d=%d Our index is %d=%d, queue %zu", monitor_->Id(), decoder_image_count, shared_data->last_decoder_index, analysis_image_count, image_index, send_queue.size()); @@ -469,7 +492,7 @@ void AIThread::Run() { } } else { - Debug(4, "Not Doing SpeedAI on monitor %d. Decoder count is %d index %d Our count is %d, last_index is %d, index %d", + Debug(4, "Not Doing ai on monitor %d. Decoder count is %d index %d Our count is %d, last_index is %d, index %d", monitor_->Id(), decoder_image_count, shared_data->last_decoder_index, shared_data->analysis_image_count, shared_data->last_analysis_index, image_index); @@ -651,6 +674,9 @@ AIThread::AIThread(const std::shared_ptr monitor #endif #if HAVE_MEMX_H , MemX *p_memx +#endif +#if HAVE_MX_ACCL_H + , MxAccl *p_mx_accl #endif ) : monitor_(monitor), terminate_(false) @@ -660,6 +686,9 @@ AIThread::AIThread(const std::shared_ptr monitor #if HAVE_MEMX_H , memx(p_memx) #endif +#if HAVE_MX_ACCL_H + , mx_accl(p_mx_accl) +#endif #if HAVE_QUADRA ,drawbox_filter(nullptr) #endif diff --git a/src/zm_ai_server.h b/src/zm_ai_server.h index e258902d9..cb1d992ed 100644 --- a/src/zm_ai_server.h +++ b/src/zm_ai_server.h @@ -24,19 +24,23 @@ extern "C" { #if HAVE_MEMX_H #include "zm_memx.h" #endif +#if HAVE_MX_ACCL_H + #include "zm_mx_accl.h" +#endif class Monitor; class AIThread { public: - explicit AIThread(const std::shared_ptr monitor #if HAVE_UNTETHER_H - , SpeedAI *speedai + explicit AIThread(const std::shared_ptr monitor, SpeedAI *speedai); #endif #if HAVE_MEMX_H - , MemX *memx + explicit AIThread(const std::shared_ptr monitor, MemX 
*memx); +#endif +#if HAVE_MX_ACCL_H + explicit AIThread(const std::shared_ptr monitor, MxAccl *memx); #endif - ); ~AIThread(); AIThread(AIThread &rhs) = delete; AIThread(AIThread &&rhs) = delete; @@ -62,6 +66,10 @@ class AIThread { MemX *memx; MemX::Job *memx_job; #endif +#if HAVE_MX_ACCL_H + MxAccl *mx_accl; + MxAccl::Job *mx_accl_job; +#endif #if HAVE_QUADRA Quadra::filter_worker *drawbox_filter; #endif diff --git a/src/zm_memx.cpp b/src/zm_memx.cpp index 7b1bd0314..9d0db1ad8 100644 --- a/src/zm_memx.cpp +++ b/src/zm_memx.cpp @@ -36,7 +36,7 @@ #define USE_THREAD 1 -#define BATCH_SIZE 10 +#define BATCH_SIZE 1 #ifdef HAVE_MEMX_H static const char *coco_classes[] = {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", @@ -51,14 +51,17 @@ static const char *coco_classes[] = {"person", "bicycle", "car", "motorcycle", " "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"}; MemX::MemX() : - accl(), + model_id(0), + input_port(0), + output_port(0), + obj_thresh(0.25), + nms_thresh(0.45), terminate_(false), batchSize(0), inSize(0), outSize(0), count(0) { - image_size = av_image_get_buffer_size(AV_PIX_FMT_RGB24, MODEL_WIDTH, MODEL_HEIGHT, 32); } MemX::~MemX() { @@ -68,6 +71,7 @@ MemX::~MemX() { delete jobs.front(); jobs.pop_front(); } + memx_close(model_id); } bool MemX::setup( @@ -77,15 +81,35 @@ bool MemX::setup( // Load and launch module_ Debug(1, "MemX: Loading model %s", model_file.c_str()); - std::vector device_ids = {0}; - std::array use_model_shape = {false, false}; + //std::vector device_ids = {0}; + //std::array use_model_shape = {false, false}; + uint8_t model_id = 0; // Create the accelerator object and load the DFP model - accl = new MX::Runtime::MxAccl(filesystem::path(model_file), device_ids, use_model_shape); + // bind MPU device group 0 as MX3 to model 0 + memx_status status = memx_open(model_id, 0, MEMX_DEVICE_CASCADE_PLUS); + if (status == 0) { + Debug(1, "Success opening MX3"); + } else { + 
Error("Failure openping MX3"); + return false; + } - //int dfp_tag = accl.connect_dfp(model_file.c_str()); - model_info = accl->get_model_info(0); - batchSize = 1; // only 1 issupported + // download weight memory and 1st model within DFP file to device + status = memx_download_model(model_id, model_file.c_str(), 0, MEMX_DOWNLOAD_TYPE_WTMEM_AND_MODEL); + + if (status == 0) { + // read back flow 0 (port 0) input feature map shape for debug information + memx_get_ifmap_size(0, 0, &input_height, &input_width, &input_depth, &input_channels, &input_format); + // for example: shape = (224, 224, 1, 3), format = 1 + Debug(1, "ifmap shape = (%d, %d, %d, %d), format = %d", input_height, input_width, input_depth, input_channels, input_format); + memx_get_ofmap_size(0, 0, &output_height, &output_width, &output_depth, &output_channels, &output_format); + Debug(1, "ofmap shape = (%d, %d, %d, %d), format = %d", output_height, output_width, output_depth, output_channels, output_format); + image_size = av_image_get_buffer_size(AV_PIX_FMT_RGB24, input_width, input_height, 0); + } + // Set confidence threshold and process detection results + yolov8_handle = std::make_unique(input_width, input_height, input_channels, 0, 0); + yolov8_handle->SetConfidenceThreshold(obj_thresh); #if USE_THREAD thread_ = std::thread(&MemX::Run, this); #endif @@ -93,40 +117,143 @@ bool MemX::setup( } void MemX::Run() { + + memx_status status = memx_set_stream_enable(model_id, 0); + while (!(terminate_ or zm_terminate)) { - Debug(1, "MemX locking, queue size %zu", send_queue.size()); - std::unique_lock lck(mutex_); - while (send_queue.size() < BATCH_SIZE and ! 
zm_terminate) { - Debug(1, "MemX waiting, queue size %zu", send_queue.size()); - condition_.wait(lck); - } - if (send_queue.size() >= BATCH_SIZE) { - Debug(1, " batch"); - for (int i = BATCH_SIZE; i > 0 ; i--) { - Job *job = send_queue.front(); + Job *job = nullptr; + { + Debug(1, "MemX locking, queue size %zu", send_queue.size()); + std::unique_lock lck(mutex_); + while (send_queue.size() < BATCH_SIZE and ! zm_terminate) { + Debug(1, "MemX waiting, queue size %zu", send_queue.size()); + condition_.wait(lck); + } + if (send_queue.size() >= BATCH_SIZE) { + Debug(1, " batch"); + job = send_queue.front(); send_queue.pop_front(); - std::unique_lock job_lck(job->mutex_); + } else { + Debug(1, "Not batch"); + } + } // end scope for lock - Debug(3, "MemX::enqueue"); - SystemTimePoint starttime = std::chrono::system_clock::now(); - int err = 0; - if (err) { -//Error("Failed enqueue %s", uai_err_string(err)); - } else { - SystemTimePoint endtime = std::chrono::system_clock::now(); - if (endtime - starttime > Milliseconds(40)) { - Warning("MemX enqueue is too slow: %.3f seconds", FPSeconds(endtime - starttime).count()); - } else { - Debug(3, "MemX enqueue took: %.3f seconds", FPSeconds(endtime - starttime).count()); - } - } - job->notify(); - } // end foreach job in batch - } else { - Debug(1, "Not batch"); - } // end if BATCH + if (job) { + job->lock(); + inference(job); + + Debug(1, "notifying"); + job->notify(); + Debug(1, "unlocking"); + job->unlock(); + Debug(1, "Done notifying"); + job = nullptr; + } // end if job } // end while forever + + status = memx_set_stream_disable(model_id, 0); +} + +#if 0 +// Function to process model output and get bounding boxes +nlohmann::json MemX::get_detections(float *ofmap, int num_boxes, const cv::Mat &inframe, + std::vector &filtered_boxes, + std::vector> &filtered_mask_coefs) +{ + + nlohmann::json coco_predictions = nlohmann::json::array(); + + std::vector all_boxes; + std::vector all_scores; + std::vector> mask_coefs; + + // 
Iterate through the model outputs + for (int i = 0; i < num_boxes; ++i) + { + float *feature_ptr = ofmap_t_ptr + i * num_features; + + // get best class information + float confidence; + int class_id; + get_best_class_info(feature_ptr, confidence, class_id); + + if (confidence > conf_thresh) + { + // Extract and scale the bounding box coordinates + float x0 = feature_ptr[0]; + float y0 = feature_ptr[1]; + float w = feature_ptr[2]; + float h = feature_ptr[3]; + // Convert boxes from center format (cxcywh) to corner format (xyxy) + int x1 = static_cast(x0 - 0.5f * w); + int y1 = static_cast(y0 - 0.5f * h); + int x2 = x1 + w; + int y2 = y1 + h; + + // Rescale boxes to original image size + x1 = (x1 - pad_w) / scale_ratio; + x2 = (x2 - pad_w) / scale_ratio; + y1 = (y1 - pad_h) / scale_ratio; + y2 = (y2 - pad_h) / scale_ratio; + + // Clamp box boundaries to image dimensions + x1 = std::max(0, std::min(x1, ori_image_width - 1)); + x2 = std::max(0, std::min(x2, ori_image_width - 1)); + y1 = std::max(0, std::min(y1, ori_image_height - 1)); + y2 = std::max(0, std::min(y2, ori_image_height - 1)); + + // Add detected box to the list + all_boxes.emplace_back(x1, y1, x2, y2, class_id, confidence); + all_scores.emplace_back(confidence); + cv_boxes.emplace_back(x1, y1, x2 - x1, y2 - y1); + + // Add detected mask coefficient to the list + std::vector mask_coef(feature_ptr + 4 + num_class, feature_ptr + num_features); + mask_coefs.push_back(mask_coef); + } + } + o // Apply Non-Maximum Suppression (NMS) to remove overlapping boxes + std::vector nms_result; + if (!cv_boxes.empty()) { + cv::dnn::NMSBoxes(cv_boxes, all_scores, conf_thresh, nms_thresh, nms_result); + for (int idx : nms_result) + { + filtered_boxes.push_back(all_boxes[idx]); + filtered_mask_coefs.push_back(mask_coefs[idx]); + } + } +} + +#endif + + + +memx_status MemX::inference(Job *job) { + const int timeout = 200; // 200 ms + Debug(3, "MemX::enqueue"); + SystemTimePoint starttime = std::chrono::system_clock::now(); + 
memx_status status = memx_stream_ifmap(model_id, input_port, static_cast(job->ifmap.data()), timeout); + + if (status) { + Error("Failed enqueue %d", status); + return status; + } else { + SystemTimePoint endtime = std::chrono::system_clock::now(); + if (endtime - starttime > Milliseconds(40)) { + Warning("MemX enqueue is too slow: %.3f seconds", FPSeconds(endtime - starttime).count()); + } else { + Debug(3, "MemX enqueue took: %.3f seconds", FPSeconds(endtime - starttime).count()); + } + } + + // 3. Read output feature map from device after inference + if (0 == status) { + Debug(1, "Reading ofmap"); + status = memx_stream_ofmap(model_id, output_port, static_cast(job->ofmap.data()), timeout); + Debug(1, "Status from reading ofmap %d", status); + } + return status; } MemX::Job * MemX::send_image(Job *job, Image *image) { @@ -145,26 +272,65 @@ MemX::Job * MemX::send_packet(Job *job, std::shared_ptr packet) { } MemX::Job * MemX::get_job() { - Job *job = new Job(accl); + Job *job = new Job(); + AVFrame *frame = job->scaled_frame.get(); + + job->scaled_frame->width = input_width; + job->scaled_frame->height = input_height; + job->scaled_frame->format = AV_PIX_FMT_RGB24; + + Debug(1, "sw scale: scaled frame data 0x%lx, linesize %d/%d/%d/%d, height %d", + (unsigned long)frame->data, frame->linesize[0], frame->linesize[1], + frame->linesize[2], frame->linesize[3], frame->height); + +#if 1 if (av_frame_get_buffer(job->scaled_frame.get(), 32)) { Error("cannot allocate scaled frame buffer"); return nullptr; } +#endif + + job->ifmap.resize(input_width*input_width*input_depth*input_channels); + job->ofmap.resize(output_width*output_width*output_depth*output_channels); + +#if 0 + AVBufferRef *ref = av_buffer_create(reinterpret_cast(&job->ifmap), + input_width*input_width*input_depth*input_channels, + dont_free, /* Free callback */ + nullptr, /* opaque */ + 0 /* flags */ + ); + if (!ref) { + Warning("Failed to create av_buffer"); + } + job->scaled_frame->buf[0] = ref; + + int 
rc_size = av_image_fill_arrays( + job->scaled_frame->data, job->scaled_frame->linesize, + reinterpret_cast(&job->ifmap), AV_PIX_FMT_RGB24, input_width, input_height, + 0 //alignment + ); + if (rc_size < 0) { + Error("Problem setting up data pointers into image %s", av_make_error_string(rc_size).c_str()); + } else { + Debug(1, "Image.Populate frame rc/size %d %dx%d", rc_size, job->scaled_frame->width, job->scaled_frame->height); + } +#endif std::unique_lock lck(mutex_); - jobs.push_back(job); return job; -} +} // end MemX::Job * MemX::get_job() MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) { count++; Debug(1, "MemX::send_frame %d", count); SystemTimePoint starttime = std::chrono::system_clock::now(); + job->sw_scale_ctx = sws_getCachedContext(job->sw_scale_ctx, avframe->width, avframe->height, static_cast(avframe->format), - MODEL_WIDTH, MODEL_HEIGHT, AV_PIX_FMT_RGB24, + input_width, input_height, AV_PIX_FMT_RGB24, SWS_BICUBIC, nullptr, nullptr, nullptr); //Debug(1, "Start scale"); @@ -176,8 +342,53 @@ MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) { avframe->linesize[2], avframe->linesize[3], avframe->height, job->scaled_frame->linesize[0]); return nullptr; } - job->m_width_rescale = ((float)MODEL_WIDTH / (float)avframe->width); - job->m_height_rescale = ((float)MODEL_HEIGHT / (float)avframe->height); + job->m_width_rescale = ((float)input_width / (float)avframe->width); + job->m_height_rescale = ((float)input_height / (float)avframe->height); + + std::vector accl_input_data; + accl_input_data.resize(input_width*input_height*input_depth*input_channels); + +#if 0 + yolov8_handle->PreProcess(job->scaled_frame->data[0], input_width, input_height, accl_input_data); + +#else + int offset = 0; //padding_height_ * input_width_ * 3; + float *buffer_ptr = job->ifmap.data() + offset; // YOLOv8 has 1 input + uint8_t *input_buffer_ptr = job->scaled_frame->data[0]; + + for (int row = 0; row < input_height; ++row) { + for (int col = 0; col < input_width; 
++col) { +#pragma omp simd + for(int i=0; i < 3; i++) { + buffer_ptr[0] = input_buffer_ptr[0] / (float)255.0; // red + buffer_ptr[1] = input_buffer_ptr[1] / (float)255.0; // green + buffer_ptr[2] = input_buffer_ptr[2] / (float)255.0; // blue + } + buffer_ptr += 3; + input_buffer_ptr += 3; + } + } +#endif + +#if 1 + inference(job); + YOLOv8Result result; + //yolov8_handle->PostProcess(job->ofmap, result); +#else + // Prevents processing? + job->lock(); + //std::unique_lock job_lck(job->mutex_); + { + Debug(1, "Locking"); + std::unique_lock lck(mutex_); + send_queue.push_back(job); + condition_.notify_all(); + } + Debug(1, "Waiting for inference"); + job->wait(); + Debug(1, "Done Waiting"); +#endif + starttime = std::chrono::system_clock::now(); SystemTimePoint endtime = std::chrono::system_clock::now(); if (endtime - starttime > Milliseconds(60)) { @@ -190,10 +401,11 @@ MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) { const nlohmann::json MemX::receive_detections(Job *job, float object_threshold) { nlohmann::json coco_object; - coco_object = convert_predictions_to_coco_format(job->predictions_buffer, job->m_width_rescale, job->m_height_rescale, object_threshold); + coco_object = convert_predictions_to_coco_format(job->ofmap, job->m_width_rescale, job->m_height_rescale, object_threshold); return coco_object; } +#if 0 nlohmann::json MemX::convert_predictions_to_coco_format( const std::vector& predictions, float m_width_rescale, @@ -238,5 +450,6 @@ nlohmann::json MemX::convert_predictions_to_coco_format( } return coco_predictions; } +#endif #endif diff --git a/src/zm_memx.h b/src/zm_memx.h index fbb6912f2..5598480b8 100644 --- a/src/zm_memx.h +++ b/src/zm_memx.h @@ -20,19 +20,22 @@ extern "C" { } #ifdef HAVE_MEMX_H -//#include "memx/memx.h" -#include - -// FIXME move model stuff to it's ownfile -#define MODEL_WIDTH 640 -#define MODEL_HEIGHT 640 -// * 6; // 256 boxes, each with 6 elements [l, t, r, b, class, score] -#define NUM_NMS_PREDICTIONS 256 +#include 
"memx/memx.h" +#include "zm_memx_yolov8.h" class MemX { private: - MX::Runtime::MxAccl *accl; - MX::Types::MxModelInfo model_info; + std::unique_ptr yolov8_handle; + + uint8_t model_id; + uint8_t input_port; + uint8_t output_port; + + float obj_thresh = 0.25; + float nms_thresh = 0.45; + + int32_t input_height, input_width, input_depth, input_channels, input_format; + int32_t output_height, output_width, output_depth, output_channels, output_format; std::mutex mutex_; std::condition_variable condition_; @@ -50,7 +53,6 @@ class MemX { public: class Job { private: - MX::Runtime::MxAccl *m_accl; public: int index; @@ -58,13 +60,15 @@ class MemX { float m_width_rescale; float m_height_rescale; SwsContext *sw_scale_ctx; - std::vector predictions_buffer; + + std::vector ifmap; + std::vector ofmap; + std::mutex mutex_; std::unique_lock lck_; std::condition_variable condition_; - Job(MX::Runtime::MxAccl *p_accl) : - m_accl(p_accl), + Job() : index(0), m_width_rescale(1.0), m_height_rescale(1.0), @@ -72,10 +76,6 @@ class MemX { lck_(mutex_, std::defer_lock) { scaled_frame = av_frame_ptr(zm_av_frame_alloc()); - scaled_frame->width = MODEL_WIDTH; - scaled_frame->height = MODEL_HEIGHT; - scaled_frame->format = AV_PIX_FMT_RGB24; - predictions_buffer.resize(NUM_NMS_PREDICTIONS*6); }; ~Job() { if (sw_scale_ctx) { @@ -85,14 +85,12 @@ class MemX { void setFrame(AVFrame *frame) { } Job(Job &&in) : - m_accl(in.m_accl), index(in.index), scaled_frame(std::move(in.scaled_frame)), m_width_rescale(in.m_width_rescale), m_height_rescale(in.m_height_rescale) { Debug(1, "In move"); - //in.scaled_frame = nullptr; } void lock() { lck_.lock(); @@ -117,6 +115,7 @@ class MemX { Job * send_packet(Job *job, std::shared_ptr); Job * send_image(Job *job, Image *image); Job * send_frame(Job *job, AVFrame *); + memx_status inference(Job *job); const nlohmann::json receive_detections(Job *job, float threshold); nlohmann::json convert_predictions_to_coco_format(const std::vector& predictions, float, float, 
float threshold); diff --git a/src/zm_memx_accl_yolov8.cpp b/src/zm_memx_accl_yolov8.cpp new file mode 100644 index 000000000..2b836a2b6 --- /dev/null +++ b/src/zm_memx_accl_yolov8.cpp @@ -0,0 +1,313 @@ +#include "zm_memx_yolov8.h" +#include "zm_logger.h" +#include + +YOLOv8::YOLOv8(size_t input_width, size_t input_height, size_t input_channel, float confidence_thresh, float iou_thresh) { + Init(input_width, input_height, input_channel, confidence_thresh, iou_thresh, COCO_CLASS_NUMBER, COCO_NAMES); +} + +YOLOv8::YOLOv8(size_t input_width, size_t input_height, size_t input_channel, + float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels) +{ + Init(input_width, input_height, input_channel, confidence_thresh, iou_thresh, class_count, class_labels); +} + +void YOLOv8::Init(size_t accl_input_width, size_t accl_input_height, size_t accl_input_channel, + float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels) +{ + if (confidence_thresh < 0.0f || confidence_thresh > 1.0f || iou_thresh < 0.0f || iou_thresh > 1.0f) { + throw std::invalid_argument("Confidence and IOU threshold must be between 0.0 and 1.0."); + } + + if (accl_input_width != accl_input_height) { + throw std::invalid_argument("The YOLOv8 api requires model input to have equal width and height dimensions." 
+ "Currently, inconsistent input dimensions are not supported."); + } + + accl_input_width_ = accl_input_width; + accl_input_height_ = accl_input_height; + accl_input_channel_ = accl_input_channel; + confidence_thresh_ = confidence_thresh; + iou_thresh_ = iou_thresh; + class_labels_ = class_labels; + class_count_ = class_count; + valid_input_ = false; + + yolo_post_layers_[0] = { + .coordinate_ofmap_flow_id = 0, + .confidence_ofmap_flow_id = 1, + .width = accl_input_width_ / 8, // L0_HW, 640 / 8 = 80 + .height = accl_input_height_ / 8, // L0_HW, 640 / 8 = 80 + .ratio = 8, + .coordinate_fmap_size = 64, + }; + + yolo_post_layers_[1] = { + .coordinate_ofmap_flow_id = 2, + .confidence_ofmap_flow_id = 3, + .width = accl_input_width_ / 16, // L1_HW, 640 / 16 = 40 + .height = accl_input_height_ / 16, // L1_HW, 640 / 16 = 40 + .ratio = 16, + .coordinate_fmap_size = 64, + }; + + yolo_post_layers_[2] = { + .coordinate_ofmap_flow_id = 4, + .confidence_ofmap_flow_id = 5, + .width = accl_input_width_ / 32, // L2_HW, 640 / 32 = 20 + .height = accl_input_height_ / 32, // L2_HW, 640 / 32 = 20 + .ratio = 32, + .coordinate_fmap_size = 64, + }; +} + +YOLOv8::~YOLOv8() +{ +} + +float YOLOv8::GetConfidenceThreshold() { + std::lock_guard guard(confidence_mutex_); + return confidence_thresh_; +} + +void YOLOv8::SetConfidenceThreshold(float confidence) { + if (confidence < 0.0f || confidence > 1.0f) { + throw std::invalid_argument("Confidence threshold must be between 0.0 and 1.0."); + } + + std::lock_guard guard(confidence_mutex_); + confidence_thresh_ = confidence; +} + +bool YOLOv8::IsHorizontalInput(int disp_width, int disp_height) { + if (disp_height > disp_width) { + Error("Invalid display image: only horizontal images are supported."); + return false; + } + return true; +} + +void YOLOv8::ComputePadding(int disp_width, int disp_height) { + if (!IsHorizontalInput(disp_width, disp_height)) + return; + + // accl_input_height_ is equal to accl_input_width_ + letterbox_ratio_ = 
(float)accl_input_height_ / mxutil_max(disp_width, disp_height); + + letterbox_width_ = disp_width * letterbox_ratio_; + letterbox_height_ = disp_height * letterbox_ratio_; + + padding_width_ = (accl_input_width_ - letterbox_width_) / 2; + padding_height_ = (accl_input_height_ - letterbox_height_) / 2; + + valid_input_ = true; +} + +void YOLOv8::PreProcess(uint8_t *rgb_data, int image_width, int image_height, std::vector input_buffers) { + if (!valid_input_) { + throw std::runtime_error("Make sure to call ComputePadding() before further processing."); + } + + int offset = padding_height_ * accl_input_width_ * 3; + float *buffer_ptr = input_buffers[0] + offset; // YOLOv8 has 1 input + +#if 0 + cv::Mat src_img = cv::Mat(image_height, image_width, CV_8UC3, rgb_data); + cv::Mat resized_img; + cv::resize(src_img, resized_img, cv::Size(letterbox_width_, letterbox_height_), 0, 0, cv::INTER_LINEAR); + + + for (int row = 0; row < resized_img.rows; ++row) { + const cv::Vec3b *pixel_row = resized_img.ptr(row); + + for (int col = 0; col < resized_img.cols; ++col) { + const cv::Vec3b &pixelValue = pixel_row[col]; + +#pragma omp simd + for(int i=0; i < 3; i++) { + buffer_ptr[0] = pixelValue[0] / (float)255.0; // red + buffer_ptr[1] = pixelValue[1] / (float)255.0; // green + buffer_ptr[2] = pixelValue[2] / (float)255.0; // blue + } + buffer_ptr += 3; + } + } +#else + + uint8_t *input_buffer_ptr = rgb_data; + + for (int row = 0; row < image_height; ++row) { + for (int col = 0; col < image_width; ++col) { +#pragma omp simd + for(int i=0; i < 3; i++) { + buffer_ptr[0] = input_buffer_ptr[0] / (float)255.0; // red + buffer_ptr[1] = input_buffer_ptr[1] / (float)255.0; // green + buffer_ptr[2] = input_buffer_ptr[2] / (float)255.0; // blue + } + buffer_ptr += 3; + input_buffer_ptr += 3; + } + } +#endif +} + + +void YOLOv8::GetDetection( + std::queue &bboxes, + int layer_id, + float *confidence_cell_buf, + float *coordinate_cell_buf, + int row, int col, float *confs_tmp) +{ + // process 
confidence score + float best_label_score = -1.0; + int best_label; + + // Force use of SIMD to calculate sigmoid. + // + // The space tradeoff of the temp result buffer + // is more than worth the speedup / CPU usage reduction + // (~75% CPU [down from 90% due to other changes] down to 59% CPU on my laptop) +#pragma omp simd + for (size_t label = 0; label < class_count_; label++) { + confs_tmp[label] = mxutil_prepost_sigmoid(confidence_cell_buf[label]); + } + // no way to avoid O(n) here, though + for (size_t label = 0; label < class_count_; label++) { + if (confs_tmp[label] > best_label_score) { + best_label_score = confs_tmp[label]; + best_label = label; + } + } + + if (best_label_score < confidence_thresh_) + return; + + std::vector feature_value{}; + + for (int channel = 0; channel < 4; channel++) { // split 64 into 4*16 + float value = 0.0; + float *feature_buf = coordinate_cell_buf + channel * 16; + float softmax_sum = 0.0; + float local_max = feature_buf[0]; + + // apply softmax and weighted sum + for (int i = 1; i < 16; i++) { + if (feature_buf[i] > local_max) + local_max = feature_buf[i]; + } + + // more SIMD hints +#pragma omp simd reduction(+:softmax_sum) + for (int i = 0; i < 16; i++) { + softmax_sum += expf(feature_buf[i] - local_max); + } + + // more SIMD hints +#pragma omp simd reduction(+:value) + for (int i = 0; i < 16; i++) { + value += ((float)i * (float)(expf(feature_buf[i] - local_max) / softmax_sum)); + } + feature_value.push_back(value); + } + + float center_x, center_y, w, h; + + center_x = (feature_value[2] - feature_value[0] + 2 * (0.5 + ((float)col))) * 0.5 * yolo_post_layers_[layer_id].ratio; + center_y = (feature_value[3] - feature_value[1] + 2 * (0.5 + ((float)row))) * 0.5 * yolo_post_layers_[layer_id].ratio; + w = (feature_value[2] + feature_value[0]) * yolo_post_layers_[layer_id].ratio; + h = (feature_value[3] + feature_value[1]) * yolo_post_layers_[layer_id].ratio; + + // printf("[Layer%d] (%0.3f) %s\t: 
%.2f\t,%.2f\t,%.2f\t,%.2f\t]\n", layer_id, best_label_score, class_labels_[best_label], center_x, center_y, w, h); + + float min_x = mxutil_max(center_x - 0.5 * w, .0); + float min_y = mxutil_max(center_y - 0.5 * h, .0); + float max_x = mxutil_min(center_x + 0.5 * w, accl_input_width_); + float max_y = mxutil_min(center_y + 0.5 * h, accl_input_height_); + + BBox bbox(best_label, best_label_score, min_x, min_y, max_x, max_y); + + non_maximum_suppression(bboxes, bbox, iou_thresh_); +} + +void YOLOv8::PostProcess(std::vector output_buffers, YOLOv8Result &result) { + if (!valid_input_) { + throw std::runtime_error("Make sure to call ComputePadding() before further processing."); + } + + if (output_buffers.empty()) { + throw std::invalid_argument("output_buffers cannot be null."); + } + + float *confs_tmp = new float[class_count_]; + + for (size_t layer_id = 0; layer_id < kNumPostProcessLayers; ++layer_id) { + const auto &layer = yolo_post_layers_[layer_id]; + const int confidence_floats_per_row = (layer.width * class_count_); // 80 x 80 + const int coordinate_floats_per_row = (layer.width * layer.coordinate_fmap_size); // 80 x 64 + + const int conf_id = layer.confidence_ofmap_flow_id; + const int coord_id = layer.coordinate_ofmap_flow_id; + + float *confidence_base = output_buffers[conf_id]; + float *coordinate_base = output_buffers[coord_id]; + + if (!confidence_base || !coordinate_base) { + throw std::invalid_argument("One or more output buffers are null."); + } + + for (size_t row = 0; row < layer.height; row++) { + float *confidence_row_buf = confidence_base + row * confidence_floats_per_row; + float *coordinate_row_buf = coordinate_base + row * coordinate_floats_per_row; + + for (size_t col = 0; col < layer.width; col++) { + float *confidence_cell_buf = confidence_row_buf + col * class_count_; + float *coordinate_cell_buf = coordinate_row_buf + col * layer.coordinate_fmap_size; + GetDetection(result.bboxes, layer_id, confidence_cell_buf, coordinate_cell_buf, row, 
+                     col, confs_tmp);
+      }
+    }
+  }
+
+  delete [] confs_tmp;
+}
+
+// Convert the four decoded distances (left, top, right, bottom) of a cell
+// into a center/size box in accelerator-input pixels.
+// NOTE(review): "&center_x"/"&center_y" and "static_cast<float>" reconstructed
+// — "&cent" was rendered as "¢" and template arguments were stripped when
+// this patch was HTML-mangled.
+void YOLOv8::CalculateBboxParams(const float *feature_values, int layer_id, int row, int col,
+                                 float &center_x, float &center_y, float &box_width, float &box_height)
+{
+  const float ratio = yolo_post_layers_[layer_id].ratio;
+
+  center_x = (feature_values[2] - feature_values[0] + 2.0f * (0.5f + static_cast<float>(col))) * 0.5f * ratio;
+  center_y = (feature_values[3] - feature_values[1] + 2.0f * (0.5f + static_cast<float>(row))) * 0.5f * ratio;
+  box_width = (feature_values[2] + feature_values[0]) * ratio;
+  box_height = (feature_values[3] + feature_values[1]) * ratio;
+}
+
+// DFL-style decode: softmax over each 16-bin channel, then the expectation
+// (index weighted by softmax probability) is the channel's distance value.
+void YOLOv8::GetFeatureValues(const float *coordinate_buffer, float *feature_values)
+{
+  constexpr int kNumChannels = 4;
+  constexpr int kChannelSize = 16;
+
+  for (int channel = 0; channel < kNumChannels; ++channel) {
+    const float *feature_buf = coordinate_buffer + channel * kChannelSize;
+    float local_max = feature_buf[0];
+
+    // Find local max (subtracted below for numerically stable softmax)
+    for (int i = 1; i < kChannelSize; ++i) {
+      if (feature_buf[i] > local_max) {
+        local_max = feature_buf[i];
+      }
+    }
+
+    // Compute softmax and weighted sum in a single pass
+    float softmax_sum = 0.0f;
+    float value = 0.0f;
+    float temp[kChannelSize];
+    for (int i = 0; i < kChannelSize; ++i) {
+      temp[i] = feature_buf[i] - local_max;
+      float exp_val = expf(temp[i]);
+      softmax_sum += exp_val;
+      value += static_cast<float>(i) * exp_val;
+    }
+    feature_values[channel] = value / softmax_sum;
+  }
+}
diff --git a/src/zm_memx_accl_yolov8.h b/src/zm_memx_accl_yolov8.h
new file mode 100644
index 000000000..66a9c3fec
--- /dev/null
+++ b/src/zm_memx_accl_yolov8.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include "zm_memx_yolo_core.h"
+// NOTE(review): the two include targets below were stripped by HTML mangling;
+// <mutex> (std::mutex member) and <queue> (std::queue parameter) are the
+// names this header uses directly — confirm against upstream.
+#include <mutex>
+#include <queue>
+
+class YOLOv8 {
+public:
+  /** @brief Constructor for using official 80 classes COCO dataset. */
+  YOLOv8(size_t input_width, size_t input_height, size_t input_channel,
+         float confidence_thresh, float iou_thresh);
+
+  /** @brief Constructor for using a customized dataset.
*/
+  YOLOv8(size_t input_width, size_t input_height, size_t input_channel,
+         float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels);
+
+  ~YOLOv8();
+
+  /**
+   * @brief Pre-process the input image data for YOLOv8 model inference.
+   * @param rgb_data Pointer to the input image data in RGB format.
+   * @param image_width Width of the display image.
+   * @param image_height Height of the display image.
+   * @param input_buffers Vector of pointers to the input buffers used by the accelerator for processing. Should be zero-initialized.
+   */
+  void PreProcess(uint8_t *rgb_data, int image_width, int image_height, std::vector<float*> input_buffers);
+
+  /**
+   * @brief Post-process the output data from the YOLOv8 model.
+   * @param output_buffers Vector of pointers to the output buffers from the accelerator.
+   * @param result Reference to the structure where the decoded bounding box results will be stored.
+   */
+  void PostProcess(std::vector<float*> output_buffers, YOLOv8Result &result);
+
+  /** @brief Clear the detection results. */
+  void ClearDetectionResults(YOLOv8Result &result);
+
+  /** @brief Setters and getters for the confidence threshold. */
+  void SetConfidenceThreshold(float confidence);
+  float GetConfidenceThreshold();
+
+  /** @brief Compute padding values for letterboxing from the display image. */
+  void ComputePadding(int disp_width, int disp_height);
+
+  /** @brief Ensure the input dimensions are valid for horizontal display images only. */
+  bool IsHorizontalInput(int disp_width, int disp_height);
+
+private:
+  /** @brief Structure representing per-layer information of YOLOv8 output. */
+  struct LayerParams
+  {
+    uint8_t coordinate_ofmap_flow_id;
+    uint8_t confidence_ofmap_flow_id;
+    size_t width;
+    size_t height;
+    size_t ratio;
+    size_t coordinate_fmap_size;
+  };
+
+  /** @brief Initialization method to set up YOLOv8 model parameters.
*/
+  void Init(size_t accl_input_width, size_t accl_input_height, size_t accl_input_channel,
+            float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels);
+
+  /** @brief Helper methods for building detections from model output. */
+  // NOTE(review): "std::queue<BBox>" reconstructed — the template argument
+  // was stripped by HTML mangling; the .cpp pushes BBox into this queue.
+  void GetDetection(std::queue<BBox> &bounding_boxes, int layer_id, float *confidence_buffer,
+                    float *coordinate_buffer, int row, int col, float *confs_tmp);
+
+  /** @brief Helper methods for calculating bounding box parameters from feature values. */
+  void CalculateBboxParams(const float *feature_values, int layer_id, int row, int col,
+                           float &center_x, float &center_y, float &box_width, float &box_height);
+  void GetFeatureValues(const float *coordinate_buffer, float *feature_values);
+
+  static constexpr size_t kNumPostProcessLayers = 3;
+  struct LayerParams yolo_post_layers_[kNumPostProcessLayers];
+
+  // Model-specific parameters.
+  const char **class_labels_;
+  size_t class_count_;
+  size_t accl_input_width_;   // Input width to accelerator, obtained by dfp.
+  size_t accl_input_height_;  // Input height to accelerator, obtained by dfp.
+  size_t accl_input_channel_; // Input channel to accelerator, obtained by dfp.
+
+  // Confidence and IOU thresholds.
+  std::mutex confidence_mutex_;
+  float confidence_thresh_;
+  float iou_thresh_;
+
+  // Letterbox ratio and padding.
+  float letterbox_ratio_;
+  int letterbox_width_;
+  int letterbox_height_;
+  int padding_height_;
+  int padding_width_;
+  bool valid_input_;
+};
diff --git a/zoneminder-config.cmake b/zoneminder-config.cmake
index 82cfccdd9..84be278d9 100644
--- a/zoneminder-config.cmake
+++ b/zoneminder-config.cmake
@@ -10,6 +10,7 @@
 #cmakedefine HAVE_EXECINFO_H 1
 #cmakedefine HAVE_UNTETHER_H 1
 #cmakedefine HAVE_MEMX_H 1
+#cmakedefine HAVE_MX_ACCL_H 1
 #cmakedefine HAVE_QUADRA 1
 #cmakedefine HAVE_UCONTEXT_H 1
 #cmakedefine HAVE_SYS_SENDFILE_H 1