Rough in memx support

less_triggers
Isaac Connor 2025-08-08 14:57:03 -07:00
parent 1daf5a40f4
commit 3ddf7bd065
9 changed files with 769 additions and 97 deletions

View File

@ -32,12 +32,12 @@ endif()
#set(CMAKE_INSTALL_ALWAYS ON)
# Default CLFAGS and CXXFLAGS:
set(CMAKE_C_FLAGS_RELEASE "-O2")
set(CMAKE_CXX_FLAGS_RELEASE "-O2")
set(CMAKE_C_FLAGS_DEBUG "-g")
set(CMAKE_CXX_FLAGS_DEBUG "-g")
set(CMAKE_C_FLAGS_OPTIMISED "-O3")
set(CMAKE_CXX_FLAGS_OPTIMISED "-O3")
set(CMAKE_C_FLAGS_RELEASE "-O2 -fopenmp")
set(CMAKE_CXX_FLAGS_RELEASE "-O2 -fopenmp")
set(CMAKE_C_FLAGS_DEBUG "-g -fopenmp")
set(CMAKE_CXX_FLAGS_DEBUG "-g -fopenmp")
set(CMAKE_C_FLAGS_OPTIMISED "-O3 -fopenmp")
set(CMAKE_CXX_FLAGS_OPTIMISED "-O3 -fopenmp")
set(CMAKE_INCLUDE_CURRENT_DIR ON)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
set(CMAKE_CXX_STANDARD 17)
@ -357,10 +357,21 @@ endif()
find_file(HAVE_MEMX_H memx.h /usr/include/memx)
if(HAVE_MEMX_H)
find_library(MEMX_LIBRARIES mx_accl)
find_library(MEMX_LIBRARIES memx)
if(MEMX_LIBRARIES)
set(optlibsfound "${optlibsfound} mx_accl")
set(optlibsfound "${optlibsfound} memx")
list(APPEND ZM_BIN_LIBS "${MEMX_LIBRARIES}")
else()
set(optlibsnotfound "${optlibsnotfound} memx")
endif()
endif()
find_file(HAVE_MX_ACCL_H MxAccl.h /usr/include/memx/accl)
if(HAVE_MX_ACCL_H)
find_library(MX_ACCL_LIBRARIES mx_accl)
if(MX_ACCL_LIBRARIES)
set(optlibsfound "${optlibsfound} mx_accl")
list(APPEND ZM_BIN_LIBS "${MX_ACCL_LIBRARIES}")
else()
set(optlibsnotfound "${optlibsnotfound} mx_accl")
endif()

View File

@ -37,6 +37,9 @@ set(ZM_BIN_SRC_FILES
zm_libvnc_camera.cpp
zm_local_camera.cpp
zm_memx.cpp
zm_mx_accl.cpp
zm_memx_yolo_core.cpp
zm_memx_yolov8.cpp
zm_monitor.cpp
zm_monitor_monitorlink.cpp
zm_monitor_rtsp2web.cpp

View File

@ -87,11 +87,17 @@ void Usage() {
Quadra quadra;
#endif
#undef HAVE_MEMX_H
#if HAVE_MEMX_H
#include "zm_memx.h"
MemX *memx;
#endif
#if HAVE_MX_ACCL_H
#include "zm_mx_accl.h"
MxAccl *mx_accl;
#endif
int main(int argc, char *argv[]) {
self = argv[0];
@ -172,10 +178,12 @@ int main(int argc, char *argv[]) {
return 0;
}
#endif
#if HAVE_MEMX_H
Debug(1, "Including MemX");
memx = new MemX();
if (!memx->setup("yolov5", "/var/cache/zoneminder/models/YOLO_v5_nano_leaky_416_416_3_tensorflow.dfp")) {
if (!memx->setup("yolov8", "/var/cache/zoneminder/models/YOLO_v8_nano_640_640_3_onnx.dfp")) {
//if (!memx->setup("yolov5", "/var/cache/zoneminder/models/YOLO_v5_nano_leaky_416_416_3_tensorflow.dfp")) {
delete memx;
memx = nullptr;
return 0;
@ -183,6 +191,17 @@ int main(int argc, char *argv[]) {
#else
Debug(1, "Not Including MemX");
#endif
#if HAVE_MX_ACCL_H
Debug(1, "Including MemX_Accl");
mx_accl = new MxAccl();
if (!mx_accl->setup("yolov8", "/var/cache/zoneminder/models/YOLO_v8_nano_640_640_3_onnx.dfp")) {
delete mx_accl;
mx_accl = nullptr;
return 0;
}
#else
Debug(1, "Not Including MXACCL");
#endif
std::unordered_map<unsigned int, std::shared_ptr<Monitor>> monitors;
std::unordered_map<unsigned int, AIThread *> threads;
@ -206,6 +225,9 @@ int main(int argc, char *argv[]) {
#endif
#if HAVE_MEMX_H
, memx
#endif
#if HAVE_MX_ACCL_H
, mx_accl
#endif
);
}
@ -278,6 +300,13 @@ void AIThread::Inference() {
sleep(1);
}
#endif
#ifdef HAVE_MX_ACCL_H
Debug(1, "Starting for mx_accl");
while (!terminate_ and !( mx_accl_job = mx_accl->get_job() )) {
Warning("Waiting for job");
sleep(1);
}
#endif
#if HAVE_QUADRA
#if !SOFT_DRAWBOX
@ -300,14 +329,13 @@ void AIThread::Inference() {
while (!(terminate_ or zm_terminate)) {
std::shared_ptr<ZMPacket> packet = nullptr;
Debug(1, "Getting lock");
Debug(4, "Getting shm lock");
// Need to hold the lock because it guards shared_mem as well.
std::unique_lock<std::mutex> lck(mutex_);
Debug(1, "Having lock");
Debug(4, "Having lock");
while (!(send_queue.size() or terminate_)) {
Debug(1, "Waiting");
condition_.wait(lck);
Debug(1, "Send queue size for monitor %d is %zu", monitor_->Id(), send_queue.size());
Debug(4, "Send queue size for monitor %d is %zu", monitor_->Id(), send_queue.size());
}
if (terminate_) break;
packet = send_queue.front();
@ -315,25 +343,21 @@ void AIThread::Inference() {
if (packet) {
Monitor::SharedData *shared_data = monitor_->getSharedData();
Debug(4, "Sending image %d for monitor %d", packet->image_index, monitor_->Id());
#ifdef HAVE_UNTETHER_H
speedai->send_image(job, packet->image);
Image *ai_image = monitor_->GetAnalysisImage(packet->image_index);
nlohmann::json detections = speedai->receive_detections(job, monitor_->ObjectDetection_Object_Threshold());
Debug(1, "detections %s", detections.dump().c_str());
if (detections.size()) {
draw_boxes(packet->image, ai_image, detections, monitor_->LabelSize(), monitor_->LabelSize());
//draw_boxes(drawbox_filter, drawbox_filter_ctx, packet->image, ai_image, detections, monitor_->LabelSize(), monitor_->LabelSize());
} else {
ai_image->Assign(*packet->image);
}
#ifdef HAVE_UNTETHER_H
speedai->send_image(job, packet->image);
nlohmann::json detections = speedai->receive_detections(job, monitor_->ObjectDetection_Object_Threshold());
#endif
#ifdef HAVE_MEMX_H
memx->send_image(memx_job, packet->image);
Image *ai_image = monitor_->GetAnalysisImage(packet->image_index);
nlohmann::json detections = memx->receive_detections(memx_job, monitor_->ObjectDetection_Object_Threshold());
#endif
#ifdef HAVE_MX_ACCL_H
mx_accl->send_image(mx_accl_job, packet->image);
nlohmann::json detections = mx_accl->receive_detections(mx_accl_job, monitor_->ObjectDetection_Object_Threshold());
#endif
Debug(1, "detections %s", detections.dump().c_str());
if (detections.size()) {
@ -342,7 +366,6 @@ void AIThread::Inference() {
} else {
ai_image->Assign(*packet->image);
}
#endif
shared_data->last_analysis_index = packet->image_index;
@ -436,7 +459,7 @@ void AIThread::Run() {
(send_queue.size() <= static_cast<unsigned int>(image_buffer_count))
) {
image_index = shared_data->last_decoder_index % image_buffer_count;
Debug(3, "Doing SpeedAI on monitor %d. Decoder index is %d=%d Our index is %d=%d, queue %zu",
Debug(3, "Doing ai on monitor %d. Decoder index is %d=%d Our index is %d=%d, queue %zu",
monitor_->Id(),
decoder_image_count, shared_data->last_decoder_index,
analysis_image_count, image_index, send_queue.size());
@ -469,7 +492,7 @@ void AIThread::Run() {
}
} else {
Debug(4, "Not Doing SpeedAI on monitor %d. Decoder count is %d index %d Our count is %d, last_index is %d, index %d",
Debug(4, "Not Doing ai on monitor %d. Decoder count is %d index %d Our count is %d, last_index is %d, index %d",
monitor_->Id(), decoder_image_count, shared_data->last_decoder_index,
shared_data->analysis_image_count, shared_data->last_analysis_index, image_index);
@ -651,6 +674,9 @@ AIThread::AIThread(const std::shared_ptr<Monitor> monitor
#endif
#if HAVE_MEMX_H
, MemX *p_memx
#endif
#if HAVE_MX_ACCL_H
, MxAccl *p_mx_accl
#endif
) :
monitor_(monitor), terminate_(false)
@ -660,6 +686,9 @@ AIThread::AIThread(const std::shared_ptr<Monitor> monitor
#if HAVE_MEMX_H
, memx(p_memx)
#endif
#if HAVE_MX_ACCL_H
, mx_accl(p_mx_accl)
#endif
#if HAVE_QUADRA
,drawbox_filter(nullptr)
#endif

View File

@ -24,19 +24,23 @@ extern "C" {
#if HAVE_MEMX_H
#include "zm_memx.h"
#endif
#if HAVE_MX_ACCL_H
#include "zm_mx_accl.h"
#endif
class Monitor;
class AIThread {
public:
explicit AIThread(const std::shared_ptr<Monitor> monitor
#if HAVE_UNTETHER_H
, SpeedAI *speedai
explicit AIThread(const std::shared_ptr<Monitor> monitor, SpeedAI *speedai);
#endif
#if HAVE_MEMX_H
, MemX *memx
explicit AIThread(const std::shared_ptr<Monitor> monitor, MemX *memx);
#endif
#if HAVE_MX_ACCL_H
explicit AIThread(const std::shared_ptr<Monitor> monitor, MxAccl *memx);
#endif
);
~AIThread();
AIThread(AIThread &rhs) = delete;
AIThread(AIThread &&rhs) = delete;
@ -62,6 +66,10 @@ class AIThread {
MemX *memx;
MemX::Job *memx_job;
#endif
#if HAVE_MX_ACCL_H
MxAccl *mx_accl;
MxAccl::Job *mx_accl_job;
#endif
#if HAVE_QUADRA
Quadra::filter_worker *drawbox_filter;
#endif

View File

@ -36,7 +36,7 @@
#define USE_THREAD 1
#define BATCH_SIZE 10
#define BATCH_SIZE 1
#ifdef HAVE_MEMX_H
static const char *coco_classes[] = {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
@ -51,14 +51,17 @@ static const char *coco_classes[] = {"person", "bicycle", "car", "motorcycle", "
"clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"};
MemX::MemX() :
accl(),
model_id(0),
input_port(0),
output_port(0),
obj_thresh(0.25),
nms_thresh(0.45),
terminate_(false),
batchSize(0),
inSize(0),
outSize(0),
count(0)
{
image_size = av_image_get_buffer_size(AV_PIX_FMT_RGB24, MODEL_WIDTH, MODEL_HEIGHT, 32);
}
MemX::~MemX() {
@ -68,6 +71,7 @@ MemX::~MemX() {
delete jobs.front();
jobs.pop_front();
}
memx_close(model_id);
}
bool MemX::setup(
@ -77,15 +81,35 @@ bool MemX::setup(
// Load and launch module_
Debug(1, "MemX: Loading model %s", model_file.c_str());
std::vector<int> device_ids = {0};
std::array<bool, 2> use_model_shape = {false, false};
//std::vector<int> device_ids = {0};
//std::array<bool, 2> use_model_shape = {false, false};
uint8_t model_id = 0;
// Create the accelerator object and load the DFP model
accl = new MX::Runtime::MxAccl(filesystem::path(model_file), device_ids, use_model_shape);
// bind MPU device group 0 as MX3 to model 0
memx_status status = memx_open(model_id, 0, MEMX_DEVICE_CASCADE_PLUS);
if (status == 0) {
Debug(1, "Success opening MX3");
} else {
Error("Failure openping MX3");
return false;
}
//int dfp_tag = accl.connect_dfp(model_file.c_str());
model_info = accl->get_model_info(0);
batchSize = 1; // only 1 issupported
// download weight memory and 1st model within DFP file to device
status = memx_download_model(model_id, model_file.c_str(), 0, MEMX_DOWNLOAD_TYPE_WTMEM_AND_MODEL);
if (status == 0) {
// read back flow 0 (port 0) input feature map shape for debug information
memx_get_ifmap_size(0, 0, &input_height, &input_width, &input_depth, &input_channels, &input_format);
// for example: shape = (224, 224, 1, 3), format = 1
Debug(1, "ifmap shape = (%d, %d, %d, %d), format = %d", input_height, input_width, input_depth, input_channels, input_format);
memx_get_ofmap_size(0, 0, &output_height, &output_width, &output_depth, &output_channels, &output_format);
Debug(1, "ofmap shape = (%d, %d, %d, %d), format = %d", output_height, output_width, output_depth, output_channels, output_format);
image_size = av_image_get_buffer_size(AV_PIX_FMT_RGB24, input_width, input_height, 0);
}
// Set confidence threshold and process detection results
yolov8_handle = std::make_unique<YOLOv8>(input_width, input_height, input_channels, 0, 0);
yolov8_handle->SetConfidenceThreshold(obj_thresh);
#if USE_THREAD
thread_ = std::thread(&MemX::Run, this);
#endif
@ -93,40 +117,143 @@ bool MemX::setup(
}
void MemX::Run() {
memx_status status = memx_set_stream_enable(model_id, 0);
while (!(terminate_ or zm_terminate)) {
Debug(1, "MemX locking, queue size %zu", send_queue.size());
std::unique_lock<std::mutex> lck(mutex_);
while (send_queue.size() < BATCH_SIZE and ! zm_terminate) {
Debug(1, "MemX waiting, queue size %zu", send_queue.size());
condition_.wait(lck);
}
if (send_queue.size() >= BATCH_SIZE) {
Debug(1, " batch");
for (int i = BATCH_SIZE; i > 0 ; i--) {
Job *job = send_queue.front();
Job *job = nullptr;
{
Debug(1, "MemX locking, queue size %zu", send_queue.size());
std::unique_lock<std::mutex> lck(mutex_);
while (send_queue.size() < BATCH_SIZE and ! zm_terminate) {
Debug(1, "MemX waiting, queue size %zu", send_queue.size());
condition_.wait(lck);
}
if (send_queue.size() >= BATCH_SIZE) {
Debug(1, " batch");
job = send_queue.front();
send_queue.pop_front();
std::unique_lock<std::mutex> job_lck(job->mutex_);
} else {
Debug(1, "Not batch");
}
} // end scope for lock
Debug(3, "MemX::enqueue");
SystemTimePoint starttime = std::chrono::system_clock::now();
int err = 0;
if (err) {
//Error("Failed enqueue %s", uai_err_string(err));
} else {
SystemTimePoint endtime = std::chrono::system_clock::now();
if (endtime - starttime > Milliseconds(40)) {
Warning("MemX enqueue is too slow: %.3f seconds", FPSeconds(endtime - starttime).count());
} else {
Debug(3, "MemX enqueue took: %.3f seconds", FPSeconds(endtime - starttime).count());
}
}
job->notify();
} // end foreach job in batch
} else {
Debug(1, "Not batch");
} // end if BATCH
if (job) {
job->lock();
inference(job);
Debug(1, "notifying");
job->notify();
Debug(1, "unlocking");
job->unlock();
Debug(1, "Done notifying");
job = nullptr;
} // end if job
} // end while forever
status = memx_set_stream_disable(model_id, 0);
}
#if 0
// Function to process model output and get bounding boxes
nlohmann::json MemX::get_detections(float *ofmap, int num_boxes, const cv::Mat &inframe,
std::vector<Box> &filtered_boxes,
std::vector<std::vector<float>> &filtered_mask_coefs)
{
nlohmann::json coco_predictions = nlohmann::json::array();
std::vector<Box> all_boxes;
std::vector<float> all_scores;
std::vector<std::vector<float>> mask_coefs;
// Iterate through the model outputs
for (int i = 0; i < num_boxes; ++i)
{
float *feature_ptr = ofmap_t_ptr + i * num_features;
// get best class information
float confidence;
int class_id;
get_best_class_info(feature_ptr, confidence, class_id);
if (confidence > conf_thresh)
{
// Extract and scale the bounding box coordinates
float x0 = feature_ptr[0];
float y0 = feature_ptr[1];
float w = feature_ptr[2];
float h = feature_ptr[3];
// Convert boxes from center format (cxcywh) to corner format (xyxy)
int x1 = static_cast<int>(x0 - 0.5f * w);
int y1 = static_cast<int>(y0 - 0.5f * h);
int x2 = x1 + w;
int y2 = y1 + h;
// Rescale boxes to original image size
x1 = (x1 - pad_w) / scale_ratio;
x2 = (x2 - pad_w) / scale_ratio;
y1 = (y1 - pad_h) / scale_ratio;
y2 = (y2 - pad_h) / scale_ratio;
// Clamp box boundaries to image dimensions
x1 = std::max(0, std::min(x1, ori_image_width - 1));
x2 = std::max(0, std::min(x2, ori_image_width - 1));
y1 = std::max(0, std::min(y1, ori_image_height - 1));
y2 = std::max(0, std::min(y2, ori_image_height - 1));
// Add detected box to the list
all_boxes.emplace_back(x1, y1, x2, y2, class_id, confidence);
all_scores.emplace_back(confidence);
cv_boxes.emplace_back(x1, y1, x2 - x1, y2 - y1);
// Add detected mask coefficient to the list
std::vector<float> mask_coef(feature_ptr + 4 + num_class, feature_ptr + num_features);
mask_coefs.push_back(mask_coef);
}
}
o // Apply Non-Maximum Suppression (NMS) to remove overlapping boxes
std::vector<int> nms_result;
if (!cv_boxes.empty()) {
cv::dnn::NMSBoxes(cv_boxes, all_scores, conf_thresh, nms_thresh, nms_result);
for (int idx : nms_result)
{
filtered_boxes.push_back(all_boxes[idx]);
filtered_mask_coefs.push_back(mask_coefs[idx]);
}
}
}
#endif
// Push one job's input feature map through the MemryX device, then read the
// output feature map back. Returns the first non-zero memx status on failure,
// otherwise the status of the ofmap read.
memx_status MemX::inference(Job *job) {
  const int timeout = 200;  // per-stream-call timeout, in milliseconds
  Debug(3, "MemX::enqueue");

  SystemTimePoint t0 = std::chrono::system_clock::now();
  memx_status status = memx_stream_ifmap(model_id, input_port, static_cast<void *>(job->ifmap.data()), timeout);
  if (status) {
    Error("Failed enqueue %d", status);
    return status;
  }

  // Enqueue succeeded; report how long the ifmap stream took.
  SystemTimePoint t1 = std::chrono::system_clock::now();
  if (t1 - t0 > Milliseconds(40)) {
    Warning("MemX enqueue is too slow: %.3f seconds", FPSeconds(t1 - t0).count());
  } else {
    Debug(3, "MemX enqueue took: %.3f seconds", FPSeconds(t1 - t0).count());
  }

  // Read the output feature map from the device after inference completes.
  Debug(1, "Reading ofmap");
  status = memx_stream_ofmap(model_id, output_port, static_cast<void *>(job->ofmap.data()), timeout);
  Debug(1, "Status from reading ofmap %d", status);
  return status;
}
MemX::Job * MemX::send_image(Job *job, Image *image) {
@ -145,26 +272,65 @@ MemX::Job * MemX::send_packet(Job *job, std::shared_ptr<ZMPacket> packet) {
}
MemX::Job * MemX::get_job() {
Job *job = new Job(accl);
Job *job = new Job();
AVFrame *frame = job->scaled_frame.get();
job->scaled_frame->width = input_width;
job->scaled_frame->height = input_height;
job->scaled_frame->format = AV_PIX_FMT_RGB24;
Debug(1, "sw scale: scaled frame data 0x%lx, linesize %d/%d/%d/%d, height %d",
(unsigned long)frame->data, frame->linesize[0], frame->linesize[1],
frame->linesize[2], frame->linesize[3], frame->height);
#if 1
if (av_frame_get_buffer(job->scaled_frame.get(), 32)) {
Error("cannot allocate scaled frame buffer");
return nullptr;
}
#endif
job->ifmap.resize(input_width*input_width*input_depth*input_channels);
job->ofmap.resize(output_width*output_width*output_depth*output_channels);
#if 0
AVBufferRef *ref = av_buffer_create(reinterpret_cast<uint8_t *>(&job->ifmap),
input_width*input_width*input_depth*input_channels,
dont_free, /* Free callback */
nullptr, /* opaque */
0 /* flags */
);
if (!ref) {
Warning("Failed to create av_buffer");
}
job->scaled_frame->buf[0] = ref;
int rc_size = av_image_fill_arrays(
job->scaled_frame->data, job->scaled_frame->linesize,
reinterpret_cast<uint8_t *>(&job->ifmap), AV_PIX_FMT_RGB24, input_width, input_height,
0 //alignment
);
if (rc_size < 0) {
Error("Problem setting up data pointers into image %s", av_make_error_string(rc_size).c_str());
} else {
Debug(1, "Image.Populate frame rc/size %d %dx%d", rc_size, job->scaled_frame->width, job->scaled_frame->height);
}
#endif
std::unique_lock<std::mutex> lck(mutex_);
jobs.push_back(job);
return job;
}
} // end MemX::Job * MemX::get_job()
MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) {
count++;
Debug(1, "MemX::send_frame %d", count);
SystemTimePoint starttime = std::chrono::system_clock::now();
job->sw_scale_ctx = sws_getCachedContext(job->sw_scale_ctx,
avframe->width, avframe->height, static_cast<AVPixelFormat>(avframe->format),
MODEL_WIDTH, MODEL_HEIGHT, AV_PIX_FMT_RGB24,
input_width, input_height, AV_PIX_FMT_RGB24,
SWS_BICUBIC, nullptr, nullptr, nullptr);
//Debug(1, "Start scale");
@ -176,8 +342,53 @@ MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) {
avframe->linesize[2], avframe->linesize[3], avframe->height, job->scaled_frame->linesize[0]);
return nullptr;
}
job->m_width_rescale = ((float)MODEL_WIDTH / (float)avframe->width);
job->m_height_rescale = ((float)MODEL_HEIGHT / (float)avframe->height);
job->m_width_rescale = ((float)input_width / (float)avframe->width);
job->m_height_rescale = ((float)input_height / (float)avframe->height);
std::vector<float *> accl_input_data;
accl_input_data.resize(input_width*input_height*input_depth*input_channels);
#if 0
yolov8_handle->PreProcess(job->scaled_frame->data[0], input_width, input_height, accl_input_data);
#else
int offset = 0; //padding_height_ * input_width_ * 3;
float *buffer_ptr = job->ifmap.data() + offset; // YOLOv8 has 1 input
uint8_t *input_buffer_ptr = job->scaled_frame->data[0];
for (int row = 0; row < input_height; ++row) {
for (int col = 0; col < input_width; ++col) {
#pragma omp simd
for(int i=0; i < 3; i++) {
buffer_ptr[0] = input_buffer_ptr[0] / (float)255.0; // red
buffer_ptr[1] = input_buffer_ptr[1] / (float)255.0; // green
buffer_ptr[2] = input_buffer_ptr[2] / (float)255.0; // blue
}
buffer_ptr += 3;
input_buffer_ptr += 3;
}
}
#endif
#if 1
inference(job);
YOLOv8Result result;
//yolov8_handle->PostProcess(job->ofmap, result);
#else
// Prevents processing?
job->lock();
//std::unique_lock<std::mutex> job_lck(job->mutex_);
{
Debug(1, "Locking");
std::unique_lock<std::mutex> lck(mutex_);
send_queue.push_back(job);
condition_.notify_all();
}
Debug(1, "Waiting for inference");
job->wait();
Debug(1, "Done Waiting");
#endif
starttime = std::chrono::system_clock::now();
SystemTimePoint endtime = std::chrono::system_clock::now();
if (endtime - starttime > Milliseconds(60)) {
@ -190,10 +401,11 @@ MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) {
const nlohmann::json MemX::receive_detections(Job *job, float object_threshold) {
nlohmann::json coco_object;
coco_object = convert_predictions_to_coco_format(job->predictions_buffer, job->m_width_rescale, job->m_height_rescale, object_threshold);
coco_object = convert_predictions_to_coco_format(job->ofmap, job->m_width_rescale, job->m_height_rescale, object_threshold);
return coco_object;
}
#if 0
nlohmann::json MemX::convert_predictions_to_coco_format(
const std::vector<float>& predictions,
float m_width_rescale,
@ -238,5 +450,6 @@ nlohmann::json MemX::convert_predictions_to_coco_format(
}
return coco_predictions;
}
#endif
#endif

View File

@ -20,19 +20,22 @@ extern "C" {
}
#ifdef HAVE_MEMX_H
//#include "memx/memx.h"
#include <memx/accl/MxAccl.h>
// FIXME move model stuff to it's ownfile
#define MODEL_WIDTH 640
#define MODEL_HEIGHT 640
// * 6; // 256 boxes, each with 6 elements [l, t, r, b, class, score]
#define NUM_NMS_PREDICTIONS 256
#include "memx/memx.h"
#include "zm_memx_yolov8.h"
class MemX {
private:
MX::Runtime::MxAccl *accl;
MX::Types::MxModelInfo model_info;
std::unique_ptr<YOLOv8> yolov8_handle;
uint8_t model_id;
uint8_t input_port;
uint8_t output_port;
float obj_thresh = 0.25;
float nms_thresh = 0.45;
int32_t input_height, input_width, input_depth, input_channels, input_format;
int32_t output_height, output_width, output_depth, output_channels, output_format;
std::mutex mutex_;
std::condition_variable condition_;
@ -50,7 +53,6 @@ class MemX {
public:
class Job {
private:
MX::Runtime::MxAccl *m_accl;
public:
int index;
@ -58,13 +60,15 @@ class MemX {
float m_width_rescale;
float m_height_rescale;
SwsContext *sw_scale_ctx;
std::vector<float> predictions_buffer;
std::vector<float> ifmap;
std::vector<float> ofmap;
std::mutex mutex_;
std::unique_lock<std::mutex> lck_;
std::condition_variable condition_;
Job(MX::Runtime::MxAccl *p_accl) :
m_accl(p_accl),
Job() :
index(0),
m_width_rescale(1.0),
m_height_rescale(1.0),
@ -72,10 +76,6 @@ class MemX {
lck_(mutex_, std::defer_lock)
{
scaled_frame = av_frame_ptr(zm_av_frame_alloc());
scaled_frame->width = MODEL_WIDTH;
scaled_frame->height = MODEL_HEIGHT;
scaled_frame->format = AV_PIX_FMT_RGB24;
predictions_buffer.resize(NUM_NMS_PREDICTIONS*6);
};
~Job() {
if (sw_scale_ctx) {
@ -85,14 +85,12 @@ class MemX {
void setFrame(AVFrame *frame) {
}
Job(Job &&in) :
m_accl(in.m_accl),
index(in.index),
scaled_frame(std::move(in.scaled_frame)),
m_width_rescale(in.m_width_rescale),
m_height_rescale(in.m_height_rescale)
{
Debug(1, "In move");
//in.scaled_frame = nullptr;
}
void lock() {
lck_.lock();
@ -117,6 +115,7 @@ class MemX {
Job * send_packet(Job *job, std::shared_ptr<ZMPacket>);
Job * send_image(Job *job, Image *image);
Job * send_frame(Job *job, AVFrame *);
memx_status inference(Job *job);
const nlohmann::json receive_detections(Job *job, float threshold);
nlohmann::json convert_predictions_to_coco_format(const std::vector<float>& predictions, float, float, float threshold);

313
src/zm_memx_accl_yolov8.cpp Normal file
View File

@ -0,0 +1,313 @@
#include "zm_memx_yolov8.h"
#include "zm_logger.h"
#include <stdexcept>
/// Convenience constructor: uses the official 80-class COCO label set by
/// delegating to the custom-dataset constructor with the COCO defaults.
YOLOv8::YOLOv8(size_t input_width, size_t input_height, size_t input_channel, float confidence_thresh, float iou_thresh)
  : YOLOv8(input_width, input_height, input_channel, confidence_thresh, iou_thresh, COCO_CLASS_NUMBER, COCO_NAMES) {
}
/// Constructor for a customized dataset: forwards everything to Init(), which
/// validates the thresholds and builds the per-layer post-processing tables.
/// @param class_labels must outlive this object — only the pointer is stored.
YOLOv8::YOLOv8(size_t input_width, size_t input_height, size_t input_channel,
  float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels)
{
  Init(input_width, input_height, input_channel, confidence_thresh, iou_thresh, class_count, class_labels);
}
/// Shared constructor body: validates arguments, stores model/dataset
/// parameters, and builds the three YOLOv8 output-decode layer descriptors.
/// @throws std::invalid_argument if a threshold is outside [0, 1] or the
///         model input is not square.
void YOLOv8::Init(size_t accl_input_width, size_t accl_input_height, size_t accl_input_channel,
  float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels)
{
  if (confidence_thresh < 0.0f || confidence_thresh > 1.0f || iou_thresh < 0.0f || iou_thresh > 1.0f) {
    throw std::invalid_argument("Confidence and IOU threshold must be between 0.0 and 1.0.");
  }
  // Downstream letterbox math assumes a square model input.
  if (accl_input_width != accl_input_height) {
    throw std::invalid_argument("The YOLOv8 api requires model input to have equal width and height dimensions."
        "Currently, inconsistent input dimensions are not supported.");
  }
  accl_input_width_ = accl_input_width;
  accl_input_height_ = accl_input_height;
  accl_input_channel_ = accl_input_channel;
  confidence_thresh_ = confidence_thresh;
  iou_thresh_ = iou_thresh;
  class_labels_ = class_labels;
  class_count_ = class_count;
  // PreProcess/PostProcess refuse to run until ComputePadding() flips this.
  valid_input_ = true;  // NOTE(review): set below; kept false until padding is known
  valid_input_ = false;
  // One descriptor per detection head; each head pairs a coordinate output
  // flow with a confidence output flow and downsamples by `ratio`.
  yolo_post_layers_[0] = {
    .coordinate_ofmap_flow_id = 0,
    .confidence_ofmap_flow_id = 1,
    .width = accl_input_width_ / 8,   // L0_HW, 640 / 8 = 80
    .height = accl_input_height_ / 8, // L0_HW, 640 / 8 = 80
    .ratio = 8,
    .coordinate_fmap_size = 64,       // 4 sides x 16 DFL bins
  };
  yolo_post_layers_[1] = {
    .coordinate_ofmap_flow_id = 2,
    .confidence_ofmap_flow_id = 3,
    .width = accl_input_width_ / 16,   // L1_HW, 640 / 16 = 40
    .height = accl_input_height_ / 16, // L1_HW, 640 / 16 = 40
    .ratio = 16,
    .coordinate_fmap_size = 64,
  };
  yolo_post_layers_[2] = {
    .coordinate_ofmap_flow_id = 4,
    .confidence_ofmap_flow_id = 5,
    .width = accl_input_width_ / 32,   // L2_HW, 640 / 32 = 20
    .height = accl_input_height_ / 32, // L2_HW, 640 / 32 = 20
    .ratio = 32,
    .coordinate_fmap_size = 64,
  };
}
// Nothing to release explicitly: members clean up via their own destructors.
YOLOv8::~YOLOv8() = default;
// Thread-safe read of the current confidence threshold.
float YOLOv8::GetConfidenceThreshold() {
  const std::lock_guard<std::mutex> lock(confidence_mutex_);
  return confidence_thresh_;
}
// Thread-safe update of the confidence threshold.
// @throws std::invalid_argument if the value is outside [0, 1].
void YOLOv8::SetConfidenceThreshold(float confidence) {
  const bool in_range = (confidence >= 0.0f) && (confidence <= 1.0f);
  if (!in_range) {
    throw std::invalid_argument("Confidence threshold must be between 0.0 and 1.0.");
  }
  const std::lock_guard<std::mutex> lock(confidence_mutex_);
  confidence_thresh_ = confidence;
}
// Landscape-only guard: logs and returns false for portrait inputs
// (height strictly greater than width); square inputs are accepted.
bool YOLOv8::IsHorizontalInput(int disp_width, int disp_height) {
  const bool horizontal = (disp_width >= disp_height);
  if (!horizontal) {
    Error("Invalid display image: only horizontal images are supported.");
  }
  return horizontal;
}
/// Compute the letterbox scale and padding that map a display-sized image
/// into the square model input, and mark the instance ready for
/// PreProcess()/PostProcess().
/// NOTE(review): on a portrait input this returns silently and leaves
/// valid_input_ unchanged — callers get a throw from the next
/// PreProcess()/PostProcess() call rather than an error here; confirm that
/// is intended.
void YOLOv8::ComputePadding(int disp_width, int disp_height) {
  if (!IsHorizontalInput(disp_width, disp_height))
    return;
  // accl_input_height_ is equal to accl_input_width_ (enforced in Init),
  // so scaling by the larger display dimension fits the image in the square.
  letterbox_ratio_ = (float)accl_input_height_ / mxutil_max(disp_width, disp_height);
  letterbox_width_ = disp_width * letterbox_ratio_;
  letterbox_height_ = disp_height * letterbox_ratio_;
  // Center the scaled image: equal bands of padding on each side.
  padding_width_ = (accl_input_width_ - letterbox_width_) / 2;
  padding_height_ = (accl_input_height_ - letterbox_height_) / 2;
  valid_input_ = true;
}
/**
 * @brief Normalize an RGB image into the accelerator input buffer.
 *
 * Converts 8-bit interleaved RGB to float in [0, 1] and writes it into
 * input_buffers[0] (YOLOv8 has a single input), offset past the top
 * letterbox padding rows computed by ComputePadding(). The buffer should be
 * zero-initialized so the padding bands stay black.
 *
 * NOTE(review): this path performs no resize — it assumes rgb_data already
 * matches the letterboxed dimensions (the caller scales the frame first).
 * Confirm against callers.
 *
 * @param rgb_data      Packed RGB24 pixel data.
 * @param image_width   Source image width in pixels.
 * @param image_height  Source image height in pixels.
 * @param input_buffers Accelerator input buffers; element 0 is filled.
 * @throws std::runtime_error if ComputePadding() has not been called.
 */
void YOLOv8::PreProcess(uint8_t *rgb_data, int image_width, int image_height, std::vector<float *> input_buffers) {
  if (!valid_input_) {
    throw std::runtime_error("Make sure to call ComputePadding() before further processing.");
  }

  // Skip the top padding rows so the image lands vertically centered.
  const int offset = padding_height_ * accl_input_width_ * 3;
  float *buffer_ptr = input_buffers[0] + offset; // YOLOv8 has 1 input
  const uint8_t *input_buffer_ptr = rgb_data;

  // Bug fix: the previous inner loop ignored its index and re-wrote the same
  // three channels on every iteration, tripling the per-pixel work. Write
  // each channel exactly once; the SIMD hint vectorizes the /255 scaling.
  const int pixel_count = image_width * image_height;
  for (int px = 0; px < pixel_count; ++px) {
    #pragma omp simd
    for (int i = 0; i < 3; ++i) {
      buffer_ptr[i] = input_buffer_ptr[i] / 255.0f; // R, G, B
    }
    buffer_ptr += 3;
    input_buffer_ptr += 3;
  }
}
/**
 * @brief Score one grid cell of a detection head and, if it clears the
 *        confidence threshold, decode its box and feed it through NMS.
 *
 * @param bboxes               Accumulator of surviving boxes.
 * @param layer_id             Index into yolo_post_layers_ (which head).
 * @param confidence_cell_buf  class_count_ raw class logits for this cell.
 * @param coordinate_cell_buf  64 floats: 4 sides x 16 DFL bins.
 * @param row, col             Cell position within the head's grid.
 * @param confs_tmp            Caller-provided scratch of class_count_ floats.
 */
void YOLOv8::GetDetection(
  std::queue<BBox> &bboxes,
  int layer_id,
  float *confidence_cell_buf,
  float *coordinate_cell_buf,
  int row, int col, float *confs_tmp)
{
  // Sigmoid all class logits into the scratch buffer first.
  //
  // Force use of SIMD to calculate sigmoid. The space tradeoff of the temp
  // result buffer is more than worth the speedup / CPU usage reduction
  // (~75% CPU [down from 90% due to other changes] down to 59% CPU).
  #pragma omp simd
  for (size_t label = 0; label < class_count_; label++) {
    confs_tmp[label] = mxutil_prepost_sigmoid(confidence_cell_buf[label]);
  }
  // Linear scan for the best class — no way to avoid O(n) here.
  float best_label_score = -1.0;
  int best_label = 0; // fix: was uninitialized (read below if class_count_ == 0)
  for (size_t label = 0; label < class_count_; label++) {
    if (confs_tmp[label] > best_label_score) {
      best_label_score = confs_tmp[label];
      best_label = label;
    }
  }
  if (best_label_score < confidence_thresh_)
    return;

  // Decode the 4 side distances: each is the softmax-weighted expectation
  // over its 16 DFL bins (max-shifted for numerical stability).
  // Stack array instead of a heap-allocating std::vector — this runs per cell.
  float feature_value[4];
  for (int channel = 0; channel < 4; channel++) { // split 64 into 4*16
    float *feature_buf = coordinate_cell_buf + channel * 16;
    float local_max = feature_buf[0];
    for (int i = 1; i < 16; i++) {
      if (feature_buf[i] > local_max)
        local_max = feature_buf[i];
    }
    float softmax_sum = 0.0;
    #pragma omp simd reduction(+:softmax_sum)
    for (int i = 0; i < 16; i++) {
      softmax_sum += expf(feature_buf[i] - local_max);
    }
    float value = 0.0;
    #pragma omp simd reduction(+:value)
    for (int i = 0; i < 16; i++) {
      value += ((float)i * (float)(expf(feature_buf[i] - local_max) / softmax_sum));
    }
    feature_value[channel] = value;
  }

  // Convert (left, top, right, bottom) distances to a clamped center box in
  // model-input pixels, then corner format for NMS.
  float center_x, center_y, w, h;
  center_x = (feature_value[2] - feature_value[0] + 2 * (0.5 + ((float)col))) * 0.5 * yolo_post_layers_[layer_id].ratio;
  center_y = (feature_value[3] - feature_value[1] + 2 * (0.5 + ((float)row))) * 0.5 * yolo_post_layers_[layer_id].ratio;
  w = (feature_value[2] + feature_value[0]) * yolo_post_layers_[layer_id].ratio;
  h = (feature_value[3] + feature_value[1]) * yolo_post_layers_[layer_id].ratio;
  float min_x = mxutil_max(center_x - 0.5 * w, .0);
  float min_y = mxutil_max(center_y - 0.5 * h, .0);
  float max_x = mxutil_min(center_x + 0.5 * w, accl_input_width_);
  float max_y = mxutil_min(center_y + 0.5 * h, accl_input_height_);
  BBox bbox(best_label, best_label_score, min_x, min_y, max_x, max_y);
  non_maximum_suppression(bboxes, bbox, iou_thresh_);
}
/**
 * @brief Decode all three detection heads of the model output into
 *        NMS-filtered bounding boxes in result.bboxes.
 *
 * @param output_buffers One float buffer per output flow, indexed by the
 *        flow ids stored in yolo_post_layers_.
 * @param result Receives the surviving detections.
 * @throws std::runtime_error    if ComputePadding() has not been called.
 * @throws std::invalid_argument if output_buffers is empty or a required
 *         buffer pointer is null.
 */
void YOLOv8::PostProcess(std::vector<float *> output_buffers, YOLOv8Result &result) {
  if (!valid_input_) {
    throw std::runtime_error("Make sure to call ComputePadding() before further processing.");
  }
  if (output_buffers.empty()) {
    throw std::invalid_argument("output_buffers cannot be null.");
  }
  // Per-cell sigmoid scratch. Fix: this was a raw new[]/delete[] pair that
  // leaked when the null-buffer throw below fired mid-loop; a vector is
  // released on any exit path.
  std::vector<float> confs_tmp(class_count_);
  for (size_t layer_id = 0; layer_id < kNumPostProcessLayers; ++layer_id) {
    const auto &layer = yolo_post_layers_[layer_id];
    const int confidence_floats_per_row = (layer.width * class_count_);          // e.g. 80 x 80
    const int coordinate_floats_per_row = (layer.width * layer.coordinate_fmap_size); // e.g. 80 x 64
    const int conf_id = layer.confidence_ofmap_flow_id;
    const int coord_id = layer.coordinate_ofmap_flow_id;
    float *confidence_base = output_buffers[conf_id];
    float *coordinate_base = output_buffers[coord_id];
    if (!confidence_base || !coordinate_base) {
      throw std::invalid_argument("One or more output buffers are null.");
    }
    // Walk the head's grid, scoring one cell at a time.
    for (size_t row = 0; row < layer.height; row++) {
      float *confidence_row_buf = confidence_base + row * confidence_floats_per_row;
      float *coordinate_row_buf = coordinate_base + row * coordinate_floats_per_row;
      for (size_t col = 0; col < layer.width; col++) {
        float *confidence_cell_buf = confidence_row_buf + col * class_count_;
        float *coordinate_cell_buf = coordinate_row_buf + col * layer.coordinate_fmap_size;
        GetDetection(result.bboxes, layer_id, confidence_cell_buf, coordinate_cell_buf, row, col, confs_tmp.data());
      }
    }
  }
}
// Decode the four DFL side distances (left, top, right, bottom) for grid cell
// (row, col) of the given head into a center-format box in model-input pixels.
void YOLOv8::CalculateBboxParams(const float *feature_values, int layer_id, int row, int col,
  float &center_x, float &center_y, float &box_width, float &box_height)
{
  const float ratio = yolo_post_layers_[layer_id].ratio;
  const float left = feature_values[0];
  const float top = feature_values[1];
  const float right = feature_values[2];
  const float bottom = feature_values[3];
  // Cell centers sit at (col + 0.5, row + 0.5) in grid units.
  const float cell_cx = 0.5f + static_cast<float>(col);
  const float cell_cy = 0.5f + static_cast<float>(row);
  center_x = (right - left + 2.0f * cell_cx) * 0.5f * ratio;
  center_y = (bottom - top + 2.0f * cell_cy) * 0.5f * ratio;
  box_width = (right + left) * ratio;
  box_height = (bottom + top) * ratio;
}
/**
 * @brief Decode the coordinate head's distribution bins into edge distances.
 *
 * Each of the 4 channels is a 16-bin distribution; the decoded value is the
 * softmax-weighted expected bin index (Distribution Focal Loss style decode).
 *
 * @param coordinate_buffer 4 * 16 floats for one feature-map cell.
 * @param feature_values Output array of 4 decoded edge distances.
 */
void YOLOv8::GetFeatureValues(const float *coordinate_buffer, float *feature_values)
{
  constexpr int kNumChannels = 4;
  constexpr int kChannelSize = 16;
  for (int channel = 0; channel < kNumChannels; ++channel) {
    const float *feature_buf = coordinate_buffer + channel * kChannelSize;
    // Find the channel max so the exponentials below are numerically stable.
    float local_max = feature_buf[0];
    for (int i = 1; i < kChannelSize; ++i) {
      if (feature_buf[i] > local_max) {
        local_max = feature_buf[i];
      }
    }
    // Compute softmax normalizer and weighted sum in a single pass.
    // (The old version staged feature_buf[i] - local_max in a scratch array;
    // each element was written then read exactly once, so compute it inline.)
    float softmax_sum = 0.0f;
    float value = 0.0f;
    for (int i = 0; i < kChannelSize; ++i) {
      const float exp_val = expf(feature_buf[i] - local_max);
      softmax_sum += exp_val;
      value += static_cast<float>(i) * exp_val;
    }
    feature_values[channel] = value / softmax_sum;
  }
}

95
src/zm_memx_accl_yolov8.h Normal file
View File

@ -0,0 +1,95 @@
#pragma once
#include "zm_memx_yolo_core.h"
#include <queue>
#include <mutex>
/** @brief YOLOv8 pre/post-processing for a MemryX-style accelerator: letterbox
 *  preprocessing of RGB frames and DFL-based decoding of output feature maps
 *  into bounding boxes. */
class YOLOv8 {
public:
/** @brief Constructor for using official 80 classes COCO dataset. */
YOLOv8(size_t input_width, size_t input_height, size_t input_channel,
float confidence_thresh, float iou_thresh);
/** @brief Constructor for using a customized dataset. */
YOLOv8(size_t input_width, size_t input_height, size_t input_channel,
float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels);
~YOLOv8();
/**
* @brief Pre-process the input image data for YOLOv8 model inference.
* @param rgb_data Pointer to the input image data in RGB format.
* @param image_width Width of the display image.
* @param image_height Height of the display image.
* @param input_buffers Vector of pointers to the input buffers used by the accelerator for processing. Should be zero-initialized.
*/
void PreProcess(uint8_t *rgb_data, int image_width, int image_height, std::vector<float *> input_buffers);
/**
* @brief Post-process the output data from the YOLOv8 model.
* @param output_buffers Vector of pointers to the output buffers from the accelerator.
* @param result Reference to the structure where the decoded bounding box results will be stored.
* @throws std::runtime_error if ComputePadding() has not been called first.
* @throws std::invalid_argument if output_buffers is empty or contains null pointers.
*/
void PostProcess(std::vector<float *> output_buffers, YOLOv8Result &result);
/** @brief Clear the detection results. */
void ClearDetectionResults(YOLOv8Result &result);
/** @brief Setters and getters for the confidence threshold (guarded by confidence_mutex_). */
void SetConfidenceThreshold(float confidence);
float GetConfidenceThreshold();
/** @brief Compute padding values for letterboxing from the display image. Must be called before PostProcess(). */
void ComputePadding(int disp_width, int disp_height);
/** @brief Ensure the input dimensions are valid for horizontal display images only. */
bool IsHorizontalInput(int disp_width, int disp_height);
private:
/** @brief Structure representing per-layer information of YOLOv8 output. */
struct LayerParams
{
// Indices into the accelerator's output-buffer vector for this layer's
// coordinate and confidence feature maps.
uint8_t coordinate_ofmap_flow_id;
uint8_t confidence_ofmap_flow_id;
// Feature-map dimensions and the cell-to-input-pixel scale factor.
size_t width;
size_t height;
size_t ratio;
// Floats per cell in the coordinate feature map (DFL bins per box edge x 4).
size_t coordinate_fmap_size;
};
/** @brief Initialization method to set up YOLOv8 model parameters. */
void Init(size_t accl_input_width, size_t accl_input_height, size_t accl_input_channel,
float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels);
/** @brief Helper methods for building detections from model output.
*  confs_tmp is caller-provided scratch of class_count_ floats. */
void GetDetection(std::queue<BBox> &bounding_boxes, int layer_id, float *confidence_buffer,
float *coordinate_buffer, int row, int col, float *confs_tmp);
/** @brief Helper methods for calculating bounding box parameters from feature values. */
void CalculateBboxParams(const float *feature_values, int layer_id, int row, int col,
float &center_x, float &center_y, float &box_width, float &box_height);
/** @brief Decode the 4 x 16 DFL coordinate bins of one cell into 4 edge distances. */
void GetFeatureValues(const float *coordinate_buffer, float *feature_values);
// One entry per output scale of the model.
static constexpr size_t kNumPostProcessLayers = 3;
struct LayerParams yolo_post_layers_[kNumPostProcessLayers];
// Model-specific parameters.
const char **class_labels_;
size_t class_count_;
size_t accl_input_width_; // Input width to accelerator, obtained by dfp.
size_t accl_input_height_; // Input height to accelerator, obtained by dfp.
size_t accl_input_channel_; // Input channel to accelerator, obtained by dfp.
// Confidence and IOU thresholds. confidence_mutex_ guards confidence_thresh_.
std::mutex confidence_mutex_;
float confidence_thresh_;
float iou_thresh_;
// Letterbox ratio and padding, computed by ComputePadding().
float letterbox_ratio_;
int letterbox_width_;
int letterbox_height_;
int padding_height_;
int padding_width_;
// Set true once ComputePadding() has validated the display dimensions;
// PostProcess() refuses to run until then.
bool valid_input_;
};

View File

@ -10,6 +10,7 @@
#cmakedefine HAVE_EXECINFO_H 1
#cmakedefine HAVE_UNTETHER_H 1
#cmakedefine HAVE_MEMX_H 1
#cmakedefine HAVE_MX_ACCL_H 1
#cmakedefine HAVE_QUADRA 1
#cmakedefine HAVE_UCONTEXT_H 1
#cmakedefine HAVE_SYS_SENDFILE_H 1