Rough in memx support
parent
1daf5a40f4
commit
3ddf7bd065
|
|
@ -32,12 +32,12 @@ endif()
|
|||
#set(CMAKE_INSTALL_ALWAYS ON)
|
||||
|
||||
# Default CLFAGS and CXXFLAGS:
|
||||
set(CMAKE_C_FLAGS_RELEASE "-O2")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O2")
|
||||
set(CMAKE_C_FLAGS_DEBUG "-g")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "-g")
|
||||
set(CMAKE_C_FLAGS_OPTIMISED "-O3")
|
||||
set(CMAKE_CXX_FLAGS_OPTIMISED "-O3")
|
||||
set(CMAKE_C_FLAGS_RELEASE "-O2 -fopenmp")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O2 -fopenmp")
|
||||
set(CMAKE_C_FLAGS_DEBUG "-g -fopenmp")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "-g -fopenmp")
|
||||
set(CMAKE_C_FLAGS_OPTIMISED "-O3 -fopenmp")
|
||||
set(CMAKE_CXX_FLAGS_OPTIMISED "-O3 -fopenmp")
|
||||
set(CMAKE_INCLUDE_CURRENT_DIR ON)
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/")
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
|
|
@ -357,10 +357,21 @@ endif()
|
|||
|
||||
find_file(HAVE_MEMX_H memx.h /usr/include/memx)
|
||||
if(HAVE_MEMX_H)
|
||||
find_library(MEMX_LIBRARIES mx_accl)
|
||||
find_library(MEMX_LIBRARIES memx)
|
||||
if(MEMX_LIBRARIES)
|
||||
set(optlibsfound "${optlibsfound} mx_accl")
|
||||
set(optlibsfound "${optlibsfound} memx")
|
||||
list(APPEND ZM_BIN_LIBS "${MEMX_LIBRARIES}")
|
||||
else()
|
||||
set(optlibsnotfound "${optlibsnotfound} memx")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
find_file(HAVE_MX_ACCL_H MxAccl.h /usr/include/memx/accl)
|
||||
if(HAVE_MX_ACCL_H)
|
||||
find_library(MX_ACCL_LIBRARIES mx_accl)
|
||||
if(MX_ACCL_LIBRARIES)
|
||||
set(optlibsfound "${optlibsfound} mx_accl")
|
||||
list(APPEND ZM_BIN_LIBS "${MX_ACCL_LIBRARIES}")
|
||||
else()
|
||||
set(optlibsnotfound "${optlibsnotfound} mx_accl")
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -37,6 +37,9 @@ set(ZM_BIN_SRC_FILES
|
|||
zm_libvnc_camera.cpp
|
||||
zm_local_camera.cpp
|
||||
zm_memx.cpp
|
||||
zm_mx_accl.cpp
|
||||
zm_memx_yolo_core.cpp
|
||||
zm_memx_yolov8.cpp
|
||||
zm_monitor.cpp
|
||||
zm_monitor_monitorlink.cpp
|
||||
zm_monitor_rtsp2web.cpp
|
||||
|
|
|
|||
|
|
@ -87,11 +87,17 @@ void Usage() {
|
|||
Quadra quadra;
|
||||
#endif
|
||||
|
||||
#undef HAVE_MEMX_H
|
||||
#if HAVE_MEMX_H
|
||||
#include "zm_memx.h"
|
||||
MemX *memx;
|
||||
#endif
|
||||
|
||||
#if HAVE_MX_ACCL_H
|
||||
#include "zm_mx_accl.h"
|
||||
MxAccl *mx_accl;
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
self = argv[0];
|
||||
|
||||
|
|
@ -172,10 +178,12 @@ int main(int argc, char *argv[]) {
|
|||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_MEMX_H
|
||||
Debug(1, "Including MemX");
|
||||
memx = new MemX();
|
||||
if (!memx->setup("yolov5", "/var/cache/zoneminder/models/YOLO_v5_nano_leaky_416_416_3_tensorflow.dfp")) {
|
||||
if (!memx->setup("yolov8", "/var/cache/zoneminder/models/YOLO_v8_nano_640_640_3_onnx.dfp")) {
|
||||
//if (!memx->setup("yolov5", "/var/cache/zoneminder/models/YOLO_v5_nano_leaky_416_416_3_tensorflow.dfp")) {
|
||||
delete memx;
|
||||
memx = nullptr;
|
||||
return 0;
|
||||
|
|
@ -183,6 +191,17 @@ int main(int argc, char *argv[]) {
|
|||
#else
|
||||
Debug(1, "Not Including MemX");
|
||||
#endif
|
||||
#if HAVE_MX_ACCL_H
|
||||
Debug(1, "Including MemX_Accl");
|
||||
mx_accl = new MxAccl();
|
||||
if (!mx_accl->setup("yolov8", "/var/cache/zoneminder/models/YOLO_v8_nano_640_640_3_onnx.dfp")) {
|
||||
delete mx_accl;
|
||||
mx_accl = nullptr;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
Debug(1, "Not Including MXACCL");
|
||||
#endif
|
||||
|
||||
std::unordered_map<unsigned int, std::shared_ptr<Monitor>> monitors;
|
||||
std::unordered_map<unsigned int, AIThread *> threads;
|
||||
|
|
@ -206,6 +225,9 @@ int main(int argc, char *argv[]) {
|
|||
#endif
|
||||
#if HAVE_MEMX_H
|
||||
, memx
|
||||
#endif
|
||||
#if HAVE_MX_ACCL_H
|
||||
, mx_accl
|
||||
#endif
|
||||
);
|
||||
}
|
||||
|
|
@ -278,6 +300,13 @@ void AIThread::Inference() {
|
|||
sleep(1);
|
||||
}
|
||||
#endif
|
||||
#ifdef HAVE_MX_ACCL_H
|
||||
Debug(1, "Starting for mx_accl");
|
||||
while (!terminate_ and !( mx_accl_job = mx_accl->get_job() )) {
|
||||
Warning("Waiting for job");
|
||||
sleep(1);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_QUADRA
|
||||
#if !SOFT_DRAWBOX
|
||||
|
|
@ -300,14 +329,13 @@ void AIThread::Inference() {
|
|||
|
||||
while (!(terminate_ or zm_terminate)) {
|
||||
std::shared_ptr<ZMPacket> packet = nullptr;
|
||||
Debug(1, "Getting lock");
|
||||
Debug(4, "Getting shm lock");
|
||||
// Need to hold the lock because it guards shared_mem as well.
|
||||
std::unique_lock<std::mutex> lck(mutex_);
|
||||
Debug(1, "Having lock");
|
||||
Debug(4, "Having lock");
|
||||
while (!(send_queue.size() or terminate_)) {
|
||||
Debug(1, "Waiting");
|
||||
condition_.wait(lck);
|
||||
Debug(1, "Send queue size for monitor %d is %zu", monitor_->Id(), send_queue.size());
|
||||
Debug(4, "Send queue size for monitor %d is %zu", monitor_->Id(), send_queue.size());
|
||||
}
|
||||
if (terminate_) break;
|
||||
packet = send_queue.front();
|
||||
|
|
@ -315,25 +343,21 @@ void AIThread::Inference() {
|
|||
if (packet) {
|
||||
Monitor::SharedData *shared_data = monitor_->getSharedData();
|
||||
Debug(4, "Sending image %d for monitor %d", packet->image_index, monitor_->Id());
|
||||
#ifdef HAVE_UNTETHER_H
|
||||
speedai->send_image(job, packet->image);
|
||||
|
||||
Image *ai_image = monitor_->GetAnalysisImage(packet->image_index);
|
||||
nlohmann::json detections = speedai->receive_detections(job, monitor_->ObjectDetection_Object_Threshold());
|
||||
Debug(1, "detections %s", detections.dump().c_str());
|
||||
|
||||
if (detections.size()) {
|
||||
draw_boxes(packet->image, ai_image, detections, monitor_->LabelSize(), monitor_->LabelSize());
|
||||
//draw_boxes(drawbox_filter, drawbox_filter_ctx, packet->image, ai_image, detections, monitor_->LabelSize(), monitor_->LabelSize());
|
||||
} else {
|
||||
ai_image->Assign(*packet->image);
|
||||
}
|
||||
#ifdef HAVE_UNTETHER_H
|
||||
speedai->send_image(job, packet->image);
|
||||
nlohmann::json detections = speedai->receive_detections(job, monitor_->ObjectDetection_Object_Threshold());
|
||||
#endif
|
||||
#ifdef HAVE_MEMX_H
|
||||
memx->send_image(memx_job, packet->image);
|
||||
|
||||
Image *ai_image = monitor_->GetAnalysisImage(packet->image_index);
|
||||
nlohmann::json detections = memx->receive_detections(memx_job, monitor_->ObjectDetection_Object_Threshold());
|
||||
#endif
|
||||
#ifdef HAVE_MX_ACCL_H
|
||||
mx_accl->send_image(mx_accl_job, packet->image);
|
||||
nlohmann::json detections = mx_accl->receive_detections(mx_accl_job, monitor_->ObjectDetection_Object_Threshold());
|
||||
#endif
|
||||
Debug(1, "detections %s", detections.dump().c_str());
|
||||
|
||||
if (detections.size()) {
|
||||
|
|
@ -342,7 +366,6 @@ void AIThread::Inference() {
|
|||
} else {
|
||||
ai_image->Assign(*packet->image);
|
||||
}
|
||||
#endif
|
||||
|
||||
shared_data->last_analysis_index = packet->image_index;
|
||||
|
||||
|
|
@ -436,7 +459,7 @@ void AIThread::Run() {
|
|||
(send_queue.size() <= static_cast<unsigned int>(image_buffer_count))
|
||||
) {
|
||||
image_index = shared_data->last_decoder_index % image_buffer_count;
|
||||
Debug(3, "Doing SpeedAI on monitor %d. Decoder index is %d=%d Our index is %d=%d, queue %zu",
|
||||
Debug(3, "Doing ai on monitor %d. Decoder index is %d=%d Our index is %d=%d, queue %zu",
|
||||
monitor_->Id(),
|
||||
decoder_image_count, shared_data->last_decoder_index,
|
||||
analysis_image_count, image_index, send_queue.size());
|
||||
|
|
@ -469,7 +492,7 @@ void AIThread::Run() {
|
|||
}
|
||||
|
||||
} else {
|
||||
Debug(4, "Not Doing SpeedAI on monitor %d. Decoder count is %d index %d Our count is %d, last_index is %d, index %d",
|
||||
Debug(4, "Not Doing ai on monitor %d. Decoder count is %d index %d Our count is %d, last_index is %d, index %d",
|
||||
monitor_->Id(), decoder_image_count, shared_data->last_decoder_index,
|
||||
shared_data->analysis_image_count, shared_data->last_analysis_index, image_index);
|
||||
|
||||
|
|
@ -651,6 +674,9 @@ AIThread::AIThread(const std::shared_ptr<Monitor> monitor
|
|||
#endif
|
||||
#if HAVE_MEMX_H
|
||||
, MemX *p_memx
|
||||
#endif
|
||||
#if HAVE_MX_ACCL_H
|
||||
, MxAccl *p_mx_accl
|
||||
#endif
|
||||
) :
|
||||
monitor_(monitor), terminate_(false)
|
||||
|
|
@ -660,6 +686,9 @@ AIThread::AIThread(const std::shared_ptr<Monitor> monitor
|
|||
#if HAVE_MEMX_H
|
||||
, memx(p_memx)
|
||||
#endif
|
||||
#if HAVE_MX_ACCL_H
|
||||
, mx_accl(p_mx_accl)
|
||||
#endif
|
||||
#if HAVE_QUADRA
|
||||
,drawbox_filter(nullptr)
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -24,19 +24,23 @@ extern "C" {
|
|||
#if HAVE_MEMX_H
|
||||
#include "zm_memx.h"
|
||||
#endif
|
||||
#if HAVE_MX_ACCL_H
|
||||
#include "zm_mx_accl.h"
|
||||
#endif
|
||||
|
||||
class Monitor;
|
||||
|
||||
class AIThread {
|
||||
public:
|
||||
explicit AIThread(const std::shared_ptr<Monitor> monitor
|
||||
#if HAVE_UNTETHER_H
|
||||
, SpeedAI *speedai
|
||||
explicit AIThread(const std::shared_ptr<Monitor> monitor, SpeedAI *speedai);
|
||||
#endif
|
||||
#if HAVE_MEMX_H
|
||||
, MemX *memx
|
||||
explicit AIThread(const std::shared_ptr<Monitor> monitor, MemX *memx);
|
||||
#endif
|
||||
#if HAVE_MX_ACCL_H
|
||||
explicit AIThread(const std::shared_ptr<Monitor> monitor, MxAccl *memx);
|
||||
#endif
|
||||
);
|
||||
~AIThread();
|
||||
AIThread(AIThread &rhs) = delete;
|
||||
AIThread(AIThread &&rhs) = delete;
|
||||
|
|
@ -62,6 +66,10 @@ class AIThread {
|
|||
MemX *memx;
|
||||
MemX::Job *memx_job;
|
||||
#endif
|
||||
#if HAVE_MX_ACCL_H
|
||||
MxAccl *mx_accl;
|
||||
MxAccl::Job *mx_accl_job;
|
||||
#endif
|
||||
#if HAVE_QUADRA
|
||||
Quadra::filter_worker *drawbox_filter;
|
||||
#endif
|
||||
|
|
|
|||
303
src/zm_memx.cpp
303
src/zm_memx.cpp
|
|
@ -36,7 +36,7 @@
|
|||
|
||||
|
||||
#define USE_THREAD 1
|
||||
#define BATCH_SIZE 10
|
||||
#define BATCH_SIZE 1
|
||||
|
||||
#ifdef HAVE_MEMX_H
|
||||
static const char *coco_classes[] = {"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
|
||||
|
|
@ -51,14 +51,17 @@ static const char *coco_classes[] = {"person", "bicycle", "car", "motorcycle", "
|
|||
"clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"};
|
||||
|
||||
MemX::MemX() :
|
||||
accl(),
|
||||
model_id(0),
|
||||
input_port(0),
|
||||
output_port(0),
|
||||
obj_thresh(0.25),
|
||||
nms_thresh(0.45),
|
||||
terminate_(false),
|
||||
batchSize(0),
|
||||
inSize(0),
|
||||
outSize(0),
|
||||
count(0)
|
||||
{
|
||||
image_size = av_image_get_buffer_size(AV_PIX_FMT_RGB24, MODEL_WIDTH, MODEL_HEIGHT, 32);
|
||||
}
|
||||
|
||||
MemX::~MemX() {
|
||||
|
|
@ -68,6 +71,7 @@ MemX::~MemX() {
|
|||
delete jobs.front();
|
||||
jobs.pop_front();
|
||||
}
|
||||
memx_close(model_id);
|
||||
}
|
||||
|
||||
bool MemX::setup(
|
||||
|
|
@ -77,15 +81,35 @@ bool MemX::setup(
|
|||
// Load and launch module_
|
||||
Debug(1, "MemX: Loading model %s", model_file.c_str());
|
||||
|
||||
std::vector<int> device_ids = {0};
|
||||
std::array<bool, 2> use_model_shape = {false, false};
|
||||
//std::vector<int> device_ids = {0};
|
||||
//std::array<bool, 2> use_model_shape = {false, false};
|
||||
|
||||
uint8_t model_id = 0;
|
||||
// Create the accelerator object and load the DFP model
|
||||
accl = new MX::Runtime::MxAccl(filesystem::path(model_file), device_ids, use_model_shape);
|
||||
// bind MPU device group 0 as MX3 to model 0
|
||||
memx_status status = memx_open(model_id, 0, MEMX_DEVICE_CASCADE_PLUS);
|
||||
if (status == 0) {
|
||||
Debug(1, "Success opening MX3");
|
||||
} else {
|
||||
Error("Failure openping MX3");
|
||||
return false;
|
||||
}
|
||||
|
||||
//int dfp_tag = accl.connect_dfp(model_file.c_str());
|
||||
model_info = accl->get_model_info(0);
|
||||
batchSize = 1; // only 1 issupported
|
||||
// download weight memory and 1st model within DFP file to device
|
||||
status = memx_download_model(model_id, model_file.c_str(), 0, MEMX_DOWNLOAD_TYPE_WTMEM_AND_MODEL);
|
||||
|
||||
if (status == 0) {
|
||||
// read back flow 0 (port 0) input feature map shape for debug information
|
||||
memx_get_ifmap_size(0, 0, &input_height, &input_width, &input_depth, &input_channels, &input_format);
|
||||
// for example: shape = (224, 224, 1, 3), format = 1
|
||||
Debug(1, "ifmap shape = (%d, %d, %d, %d), format = %d", input_height, input_width, input_depth, input_channels, input_format);
|
||||
memx_get_ofmap_size(0, 0, &output_height, &output_width, &output_depth, &output_channels, &output_format);
|
||||
Debug(1, "ofmap shape = (%d, %d, %d, %d), format = %d", output_height, output_width, output_depth, output_channels, output_format);
|
||||
image_size = av_image_get_buffer_size(AV_PIX_FMT_RGB24, input_width, input_height, 0);
|
||||
}
|
||||
// Set confidence threshold and process detection results
|
||||
yolov8_handle = std::make_unique<YOLOv8>(input_width, input_height, input_channels, 0, 0);
|
||||
yolov8_handle->SetConfidenceThreshold(obj_thresh);
|
||||
#if USE_THREAD
|
||||
thread_ = std::thread(&MemX::Run, this);
|
||||
#endif
|
||||
|
|
@ -93,40 +117,143 @@ bool MemX::setup(
|
|||
}
|
||||
|
||||
void MemX::Run() {
|
||||
|
||||
memx_status status = memx_set_stream_enable(model_id, 0);
|
||||
|
||||
while (!(terminate_ or zm_terminate)) {
|
||||
|
||||
Debug(1, "MemX locking, queue size %zu", send_queue.size());
|
||||
std::unique_lock<std::mutex> lck(mutex_);
|
||||
while (send_queue.size() < BATCH_SIZE and ! zm_terminate) {
|
||||
Debug(1, "MemX waiting, queue size %zu", send_queue.size());
|
||||
condition_.wait(lck);
|
||||
}
|
||||
if (send_queue.size() >= BATCH_SIZE) {
|
||||
Debug(1, " batch");
|
||||
for (int i = BATCH_SIZE; i > 0 ; i--) {
|
||||
Job *job = send_queue.front();
|
||||
Job *job = nullptr;
|
||||
{
|
||||
Debug(1, "MemX locking, queue size %zu", send_queue.size());
|
||||
std::unique_lock<std::mutex> lck(mutex_);
|
||||
while (send_queue.size() < BATCH_SIZE and ! zm_terminate) {
|
||||
Debug(1, "MemX waiting, queue size %zu", send_queue.size());
|
||||
condition_.wait(lck);
|
||||
}
|
||||
if (send_queue.size() >= BATCH_SIZE) {
|
||||
Debug(1, " batch");
|
||||
job = send_queue.front();
|
||||
send_queue.pop_front();
|
||||
std::unique_lock<std::mutex> job_lck(job->mutex_);
|
||||
} else {
|
||||
Debug(1, "Not batch");
|
||||
}
|
||||
} // end scope for lock
|
||||
|
||||
Debug(3, "MemX::enqueue");
|
||||
SystemTimePoint starttime = std::chrono::system_clock::now();
|
||||
int err = 0;
|
||||
if (err) {
|
||||
//Error("Failed enqueue %s", uai_err_string(err));
|
||||
} else {
|
||||
SystemTimePoint endtime = std::chrono::system_clock::now();
|
||||
if (endtime - starttime > Milliseconds(40)) {
|
||||
Warning("MemX enqueue is too slow: %.3f seconds", FPSeconds(endtime - starttime).count());
|
||||
} else {
|
||||
Debug(3, "MemX enqueue took: %.3f seconds", FPSeconds(endtime - starttime).count());
|
||||
}
|
||||
}
|
||||
job->notify();
|
||||
} // end foreach job in batch
|
||||
} else {
|
||||
Debug(1, "Not batch");
|
||||
} // end if BATCH
|
||||
if (job) {
|
||||
job->lock();
|
||||
inference(job);
|
||||
|
||||
Debug(1, "notifying");
|
||||
job->notify();
|
||||
Debug(1, "unlocking");
|
||||
job->unlock();
|
||||
Debug(1, "Done notifying");
|
||||
job = nullptr;
|
||||
} // end if job
|
||||
} // end while forever
|
||||
|
||||
status = memx_set_stream_disable(model_id, 0);
|
||||
}
|
||||
|
||||
#if 0
|
||||
// Function to process model output and get bounding boxes
|
||||
nlohmann::json MemX::get_detections(float *ofmap, int num_boxes, const cv::Mat &inframe,
|
||||
std::vector<Box> &filtered_boxes,
|
||||
std::vector<std::vector<float>> &filtered_mask_coefs)
|
||||
{
|
||||
|
||||
nlohmann::json coco_predictions = nlohmann::json::array();
|
||||
|
||||
std::vector<Box> all_boxes;
|
||||
std::vector<float> all_scores;
|
||||
std::vector<std::vector<float>> mask_coefs;
|
||||
|
||||
// Iterate through the model outputs
|
||||
for (int i = 0; i < num_boxes; ++i)
|
||||
{
|
||||
float *feature_ptr = ofmap_t_ptr + i * num_features;
|
||||
|
||||
// get best class information
|
||||
float confidence;
|
||||
int class_id;
|
||||
get_best_class_info(feature_ptr, confidence, class_id);
|
||||
|
||||
if (confidence > conf_thresh)
|
||||
{
|
||||
// Extract and scale the bounding box coordinates
|
||||
float x0 = feature_ptr[0];
|
||||
float y0 = feature_ptr[1];
|
||||
float w = feature_ptr[2];
|
||||
float h = feature_ptr[3];
|
||||
// Convert boxes from center format (cxcywh) to corner format (xyxy)
|
||||
int x1 = static_cast<int>(x0 - 0.5f * w);
|
||||
int y1 = static_cast<int>(y0 - 0.5f * h);
|
||||
int x2 = x1 + w;
|
||||
int y2 = y1 + h;
|
||||
|
||||
// Rescale boxes to original image size
|
||||
x1 = (x1 - pad_w) / scale_ratio;
|
||||
x2 = (x2 - pad_w) / scale_ratio;
|
||||
y1 = (y1 - pad_h) / scale_ratio;
|
||||
y2 = (y2 - pad_h) / scale_ratio;
|
||||
|
||||
// Clamp box boundaries to image dimensions
|
||||
x1 = std::max(0, std::min(x1, ori_image_width - 1));
|
||||
x2 = std::max(0, std::min(x2, ori_image_width - 1));
|
||||
y1 = std::max(0, std::min(y1, ori_image_height - 1));
|
||||
y2 = std::max(0, std::min(y2, ori_image_height - 1));
|
||||
|
||||
// Add detected box to the list
|
||||
all_boxes.emplace_back(x1, y1, x2, y2, class_id, confidence);
|
||||
all_scores.emplace_back(confidence);
|
||||
cv_boxes.emplace_back(x1, y1, x2 - x1, y2 - y1);
|
||||
|
||||
// Add detected mask coefficient to the list
|
||||
std::vector<float> mask_coef(feature_ptr + 4 + num_class, feature_ptr + num_features);
|
||||
mask_coefs.push_back(mask_coef);
|
||||
}
|
||||
}
|
||||
o // Apply Non-Maximum Suppression (NMS) to remove overlapping boxes
|
||||
std::vector<int> nms_result;
|
||||
if (!cv_boxes.empty()) {
|
||||
cv::dnn::NMSBoxes(cv_boxes, all_scores, conf_thresh, nms_thresh, nms_result);
|
||||
for (int idx : nms_result)
|
||||
{
|
||||
filtered_boxes.push_back(all_boxes[idx]);
|
||||
filtered_mask_coefs.push_back(mask_coefs[idx]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
memx_status MemX::inference(Job *job) {
|
||||
const int timeout = 200; // 200 ms
|
||||
Debug(3, "MemX::enqueue");
|
||||
SystemTimePoint starttime = std::chrono::system_clock::now();
|
||||
memx_status status = memx_stream_ifmap(model_id, input_port, static_cast<void *>(job->ifmap.data()), timeout);
|
||||
|
||||
if (status) {
|
||||
Error("Failed enqueue %d", status);
|
||||
return status;
|
||||
} else {
|
||||
SystemTimePoint endtime = std::chrono::system_clock::now();
|
||||
if (endtime - starttime > Milliseconds(40)) {
|
||||
Warning("MemX enqueue is too slow: %.3f seconds", FPSeconds(endtime - starttime).count());
|
||||
} else {
|
||||
Debug(3, "MemX enqueue took: %.3f seconds", FPSeconds(endtime - starttime).count());
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Read output feature map from device after inference
|
||||
if (0 == status) {
|
||||
Debug(1, "Reading ofmap");
|
||||
status = memx_stream_ofmap(model_id, output_port, static_cast<void *>(job->ofmap.data()), timeout);
|
||||
Debug(1, "Status from reading ofmap %d", status);
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
MemX::Job * MemX::send_image(Job *job, Image *image) {
|
||||
|
|
@ -145,26 +272,65 @@ MemX::Job * MemX::send_packet(Job *job, std::shared_ptr<ZMPacket> packet) {
|
|||
}
|
||||
|
||||
MemX::Job * MemX::get_job() {
|
||||
Job *job = new Job(accl);
|
||||
Job *job = new Job();
|
||||
AVFrame *frame = job->scaled_frame.get();
|
||||
|
||||
job->scaled_frame->width = input_width;
|
||||
job->scaled_frame->height = input_height;
|
||||
job->scaled_frame->format = AV_PIX_FMT_RGB24;
|
||||
|
||||
Debug(1, "sw scale: scaled frame data 0x%lx, linesize %d/%d/%d/%d, height %d",
|
||||
(unsigned long)frame->data, frame->linesize[0], frame->linesize[1],
|
||||
frame->linesize[2], frame->linesize[3], frame->height);
|
||||
|
||||
#if 1
|
||||
if (av_frame_get_buffer(job->scaled_frame.get(), 32)) {
|
||||
Error("cannot allocate scaled frame buffer");
|
||||
return nullptr;
|
||||
}
|
||||
#endif
|
||||
|
||||
job->ifmap.resize(input_width*input_width*input_depth*input_channels);
|
||||
job->ofmap.resize(output_width*output_width*output_depth*output_channels);
|
||||
|
||||
#if 0
|
||||
AVBufferRef *ref = av_buffer_create(reinterpret_cast<uint8_t *>(&job->ifmap),
|
||||
input_width*input_width*input_depth*input_channels,
|
||||
dont_free, /* Free callback */
|
||||
nullptr, /* opaque */
|
||||
0 /* flags */
|
||||
);
|
||||
if (!ref) {
|
||||
Warning("Failed to create av_buffer");
|
||||
}
|
||||
job->scaled_frame->buf[0] = ref;
|
||||
|
||||
int rc_size = av_image_fill_arrays(
|
||||
job->scaled_frame->data, job->scaled_frame->linesize,
|
||||
reinterpret_cast<uint8_t *>(&job->ifmap), AV_PIX_FMT_RGB24, input_width, input_height,
|
||||
0 //alignment
|
||||
);
|
||||
if (rc_size < 0) {
|
||||
Error("Problem setting up data pointers into image %s", av_make_error_string(rc_size).c_str());
|
||||
} else {
|
||||
Debug(1, "Image.Populate frame rc/size %d %dx%d", rc_size, job->scaled_frame->width, job->scaled_frame->height);
|
||||
}
|
||||
#endif
|
||||
|
||||
std::unique_lock<std::mutex> lck(mutex_);
|
||||
|
||||
jobs.push_back(job);
|
||||
return job;
|
||||
}
|
||||
} // end MemX::Job * MemX::get_job()
|
||||
|
||||
MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) {
|
||||
count++;
|
||||
Debug(1, "MemX::send_frame %d", count);
|
||||
|
||||
SystemTimePoint starttime = std::chrono::system_clock::now();
|
||||
|
||||
job->sw_scale_ctx = sws_getCachedContext(job->sw_scale_ctx,
|
||||
avframe->width, avframe->height, static_cast<AVPixelFormat>(avframe->format),
|
||||
MODEL_WIDTH, MODEL_HEIGHT, AV_PIX_FMT_RGB24,
|
||||
input_width, input_height, AV_PIX_FMT_RGB24,
|
||||
SWS_BICUBIC, nullptr, nullptr, nullptr);
|
||||
|
||||
//Debug(1, "Start scale");
|
||||
|
|
@ -176,8 +342,53 @@ MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) {
|
|||
avframe->linesize[2], avframe->linesize[3], avframe->height, job->scaled_frame->linesize[0]);
|
||||
return nullptr;
|
||||
}
|
||||
job->m_width_rescale = ((float)MODEL_WIDTH / (float)avframe->width);
|
||||
job->m_height_rescale = ((float)MODEL_HEIGHT / (float)avframe->height);
|
||||
job->m_width_rescale = ((float)input_width / (float)avframe->width);
|
||||
job->m_height_rescale = ((float)input_height / (float)avframe->height);
|
||||
|
||||
std::vector<float *> accl_input_data;
|
||||
accl_input_data.resize(input_width*input_height*input_depth*input_channels);
|
||||
|
||||
#if 0
|
||||
yolov8_handle->PreProcess(job->scaled_frame->data[0], input_width, input_height, accl_input_data);
|
||||
|
||||
#else
|
||||
int offset = 0; //padding_height_ * input_width_ * 3;
|
||||
float *buffer_ptr = job->ifmap.data() + offset; // YOLOv8 has 1 input
|
||||
uint8_t *input_buffer_ptr = job->scaled_frame->data[0];
|
||||
|
||||
for (int row = 0; row < input_height; ++row) {
|
||||
for (int col = 0; col < input_width; ++col) {
|
||||
#pragma omp simd
|
||||
for(int i=0; i < 3; i++) {
|
||||
buffer_ptr[0] = input_buffer_ptr[0] / (float)255.0; // red
|
||||
buffer_ptr[1] = input_buffer_ptr[1] / (float)255.0; // green
|
||||
buffer_ptr[2] = input_buffer_ptr[2] / (float)255.0; // blue
|
||||
}
|
||||
buffer_ptr += 3;
|
||||
input_buffer_ptr += 3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
inference(job);
|
||||
YOLOv8Result result;
|
||||
//yolov8_handle->PostProcess(job->ofmap, result);
|
||||
#else
|
||||
// Prevents processing?
|
||||
job->lock();
|
||||
//std::unique_lock<std::mutex> job_lck(job->mutex_);
|
||||
{
|
||||
Debug(1, "Locking");
|
||||
std::unique_lock<std::mutex> lck(mutex_);
|
||||
send_queue.push_back(job);
|
||||
condition_.notify_all();
|
||||
}
|
||||
Debug(1, "Waiting for inference");
|
||||
job->wait();
|
||||
Debug(1, "Done Waiting");
|
||||
#endif
|
||||
starttime = std::chrono::system_clock::now();
|
||||
|
||||
SystemTimePoint endtime = std::chrono::system_clock::now();
|
||||
if (endtime - starttime > Milliseconds(60)) {
|
||||
|
|
@ -190,10 +401,11 @@ MemX::Job * MemX::send_frame(Job *job, AVFrame *avframe) {
|
|||
|
||||
const nlohmann::json MemX::receive_detections(Job *job, float object_threshold) {
|
||||
nlohmann::json coco_object;
|
||||
coco_object = convert_predictions_to_coco_format(job->predictions_buffer, job->m_width_rescale, job->m_height_rescale, object_threshold);
|
||||
coco_object = convert_predictions_to_coco_format(job->ofmap, job->m_width_rescale, job->m_height_rescale, object_threshold);
|
||||
return coco_object;
|
||||
}
|
||||
|
||||
#if 0
|
||||
nlohmann::json MemX::convert_predictions_to_coco_format(
|
||||
const std::vector<float>& predictions,
|
||||
float m_width_rescale,
|
||||
|
|
@ -238,5 +450,6 @@ nlohmann::json MemX::convert_predictions_to_coco_format(
|
|||
}
|
||||
return coco_predictions;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -20,19 +20,22 @@ extern "C" {
|
|||
}
|
||||
|
||||
#ifdef HAVE_MEMX_H
|
||||
//#include "memx/memx.h"
|
||||
#include <memx/accl/MxAccl.h>
|
||||
|
||||
// FIXME move model stuff to it's ownfile
|
||||
#define MODEL_WIDTH 640
|
||||
#define MODEL_HEIGHT 640
|
||||
// * 6; // 256 boxes, each with 6 elements [l, t, r, b, class, score]
|
||||
#define NUM_NMS_PREDICTIONS 256
|
||||
#include "memx/memx.h"
|
||||
#include "zm_memx_yolov8.h"
|
||||
|
||||
class MemX {
|
||||
private:
|
||||
MX::Runtime::MxAccl *accl;
|
||||
MX::Types::MxModelInfo model_info;
|
||||
std::unique_ptr<YOLOv8> yolov8_handle;
|
||||
|
||||
uint8_t model_id;
|
||||
uint8_t input_port;
|
||||
uint8_t output_port;
|
||||
|
||||
float obj_thresh = 0.25;
|
||||
float nms_thresh = 0.45;
|
||||
|
||||
int32_t input_height, input_width, input_depth, input_channels, input_format;
|
||||
int32_t output_height, output_width, output_depth, output_channels, output_format;
|
||||
|
||||
std::mutex mutex_;
|
||||
std::condition_variable condition_;
|
||||
|
|
@ -50,7 +53,6 @@ class MemX {
|
|||
public:
|
||||
class Job {
|
||||
private:
|
||||
MX::Runtime::MxAccl *m_accl;
|
||||
public:
|
||||
|
||||
int index;
|
||||
|
|
@ -58,13 +60,15 @@ class MemX {
|
|||
float m_width_rescale;
|
||||
float m_height_rescale;
|
||||
SwsContext *sw_scale_ctx;
|
||||
std::vector<float> predictions_buffer;
|
||||
|
||||
std::vector<float> ifmap;
|
||||
std::vector<float> ofmap;
|
||||
|
||||
std::mutex mutex_;
|
||||
std::unique_lock<std::mutex> lck_;
|
||||
std::condition_variable condition_;
|
||||
|
||||
Job(MX::Runtime::MxAccl *p_accl) :
|
||||
m_accl(p_accl),
|
||||
Job() :
|
||||
index(0),
|
||||
m_width_rescale(1.0),
|
||||
m_height_rescale(1.0),
|
||||
|
|
@ -72,10 +76,6 @@ class MemX {
|
|||
lck_(mutex_, std::defer_lock)
|
||||
{
|
||||
scaled_frame = av_frame_ptr(zm_av_frame_alloc());
|
||||
scaled_frame->width = MODEL_WIDTH;
|
||||
scaled_frame->height = MODEL_HEIGHT;
|
||||
scaled_frame->format = AV_PIX_FMT_RGB24;
|
||||
predictions_buffer.resize(NUM_NMS_PREDICTIONS*6);
|
||||
};
|
||||
~Job() {
|
||||
if (sw_scale_ctx) {
|
||||
|
|
@ -85,14 +85,12 @@ class MemX {
|
|||
void setFrame(AVFrame *frame) {
|
||||
}
|
||||
Job(Job &&in) :
|
||||
m_accl(in.m_accl),
|
||||
index(in.index),
|
||||
scaled_frame(std::move(in.scaled_frame)),
|
||||
m_width_rescale(in.m_width_rescale),
|
||||
m_height_rescale(in.m_height_rescale)
|
||||
{
|
||||
Debug(1, "In move");
|
||||
//in.scaled_frame = nullptr;
|
||||
}
|
||||
void lock() {
|
||||
lck_.lock();
|
||||
|
|
@ -117,6 +115,7 @@ class MemX {
|
|||
Job * send_packet(Job *job, std::shared_ptr<ZMPacket>);
|
||||
Job * send_image(Job *job, Image *image);
|
||||
Job * send_frame(Job *job, AVFrame *);
|
||||
memx_status inference(Job *job);
|
||||
|
||||
const nlohmann::json receive_detections(Job *job, float threshold);
|
||||
nlohmann::json convert_predictions_to_coco_format(const std::vector<float>& predictions, float, float, float threshold);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,313 @@
|
|||
#include "zm_memx_yolov8.h"
|
||||
#include "zm_logger.h"
|
||||
#include <stdexcept>
|
||||
|
||||
YOLOv8::YOLOv8(size_t input_width, size_t input_height, size_t input_channel, float confidence_thresh, float iou_thresh) {
|
||||
Init(input_width, input_height, input_channel, confidence_thresh, iou_thresh, COCO_CLASS_NUMBER, COCO_NAMES);
|
||||
}
|
||||
|
||||
YOLOv8::YOLOv8(size_t input_width, size_t input_height, size_t input_channel,
|
||||
float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels)
|
||||
{
|
||||
Init(input_width, input_height, input_channel, confidence_thresh, iou_thresh, class_count, class_labels);
|
||||
}
|
||||
|
||||
void YOLOv8::Init(size_t accl_input_width, size_t accl_input_height, size_t accl_input_channel,
|
||||
float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels)
|
||||
{
|
||||
if (confidence_thresh < 0.0f || confidence_thresh > 1.0f || iou_thresh < 0.0f || iou_thresh > 1.0f) {
|
||||
throw std::invalid_argument("Confidence and IOU threshold must be between 0.0 and 1.0.");
|
||||
}
|
||||
|
||||
if (accl_input_width != accl_input_height) {
|
||||
throw std::invalid_argument("The YOLOv8 api requires model input to have equal width and height dimensions."
|
||||
"Currently, inconsistent input dimensions are not supported.");
|
||||
}
|
||||
|
||||
accl_input_width_ = accl_input_width;
|
||||
accl_input_height_ = accl_input_height;
|
||||
accl_input_channel_ = accl_input_channel;
|
||||
confidence_thresh_ = confidence_thresh;
|
||||
iou_thresh_ = iou_thresh;
|
||||
class_labels_ = class_labels;
|
||||
class_count_ = class_count;
|
||||
valid_input_ = false;
|
||||
|
||||
yolo_post_layers_[0] = {
|
||||
.coordinate_ofmap_flow_id = 0,
|
||||
.confidence_ofmap_flow_id = 1,
|
||||
.width = accl_input_width_ / 8, // L0_HW, 640 / 8 = 80
|
||||
.height = accl_input_height_ / 8, // L0_HW, 640 / 8 = 80
|
||||
.ratio = 8,
|
||||
.coordinate_fmap_size = 64,
|
||||
};
|
||||
|
||||
yolo_post_layers_[1] = {
|
||||
.coordinate_ofmap_flow_id = 2,
|
||||
.confidence_ofmap_flow_id = 3,
|
||||
.width = accl_input_width_ / 16, // L1_HW, 640 / 16 = 40
|
||||
.height = accl_input_height_ / 16, // L1_HW, 640 / 16 = 40
|
||||
.ratio = 16,
|
||||
.coordinate_fmap_size = 64,
|
||||
};
|
||||
|
||||
yolo_post_layers_[2] = {
|
||||
.coordinate_ofmap_flow_id = 4,
|
||||
.confidence_ofmap_flow_id = 5,
|
||||
.width = accl_input_width_ / 32, // L2_HW, 640 / 32 = 20
|
||||
.height = accl_input_height_ / 32, // L2_HW, 640 / 32 = 20
|
||||
.ratio = 32,
|
||||
.coordinate_fmap_size = 64,
|
||||
};
|
||||
}
|
||||
|
||||
YOLOv8::~YOLOv8()
|
||||
{
|
||||
}
|
||||
|
||||
float YOLOv8::GetConfidenceThreshold() {
|
||||
std::lock_guard<std::mutex> guard(confidence_mutex_);
|
||||
return confidence_thresh_;
|
||||
}
|
||||
|
||||
void YOLOv8::SetConfidenceThreshold(float confidence) {
|
||||
if (confidence < 0.0f || confidence > 1.0f) {
|
||||
throw std::invalid_argument("Confidence threshold must be between 0.0 and 1.0.");
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> guard(confidence_mutex_);
|
||||
confidence_thresh_ = confidence;
|
||||
}
|
||||
|
||||
bool YOLOv8::IsHorizontalInput(int disp_width, int disp_height) {
|
||||
if (disp_height > disp_width) {
|
||||
Error("Invalid display image: only horizontal images are supported.");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void YOLOv8::ComputePadding(int disp_width, int disp_height) {
|
||||
if (!IsHorizontalInput(disp_width, disp_height))
|
||||
return;
|
||||
|
||||
// accl_input_height_ is equal to accl_input_width_
|
||||
letterbox_ratio_ = (float)accl_input_height_ / mxutil_max(disp_width, disp_height);
|
||||
|
||||
letterbox_width_ = disp_width * letterbox_ratio_;
|
||||
letterbox_height_ = disp_height * letterbox_ratio_;
|
||||
|
||||
padding_width_ = (accl_input_width_ - letterbox_width_) / 2;
|
||||
padding_height_ = (accl_input_height_ - letterbox_height_) / 2;
|
||||
|
||||
valid_input_ = true;
|
||||
}
|
||||
|
||||
void YOLOv8::PreProcess(uint8_t *rgb_data, int image_width, int image_height, std::vector<float *> input_buffers) {
|
||||
if (!valid_input_) {
|
||||
throw std::runtime_error("Make sure to call ComputePadding() before further processing.");
|
||||
}
|
||||
|
||||
int offset = padding_height_ * accl_input_width_ * 3;
|
||||
float *buffer_ptr = input_buffers[0] + offset; // YOLOv8 has 1 input
|
||||
|
||||
#if 0
|
||||
cv::Mat src_img = cv::Mat(image_height, image_width, CV_8UC3, rgb_data);
|
||||
cv::Mat resized_img;
|
||||
cv::resize(src_img, resized_img, cv::Size(letterbox_width_, letterbox_height_), 0, 0, cv::INTER_LINEAR);
|
||||
|
||||
|
||||
for (int row = 0; row < resized_img.rows; ++row) {
|
||||
const cv::Vec3b *pixel_row = resized_img.ptr<cv::Vec3b>(row);
|
||||
|
||||
for (int col = 0; col < resized_img.cols; ++col) {
|
||||
const cv::Vec3b &pixelValue = pixel_row[col];
|
||||
|
||||
#pragma omp simd
|
||||
for(int i=0; i < 3; i++) {
|
||||
buffer_ptr[0] = pixelValue[0] / (float)255.0; // red
|
||||
buffer_ptr[1] = pixelValue[1] / (float)255.0; // green
|
||||
buffer_ptr[2] = pixelValue[2] / (float)255.0; // blue
|
||||
}
|
||||
buffer_ptr += 3;
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
uint8_t *input_buffer_ptr = rgb_data;
|
||||
|
||||
for (int row = 0; row < image_height; ++row) {
|
||||
for (int col = 0; col < image_width; ++col) {
|
||||
#pragma omp simd
|
||||
for(int i=0; i < 3; i++) {
|
||||
buffer_ptr[0] = input_buffer_ptr[0] / (float)255.0; // red
|
||||
buffer_ptr[1] = input_buffer_ptr[1] / (float)255.0; // green
|
||||
buffer_ptr[2] = input_buffer_ptr[2] / (float)255.0; // blue
|
||||
}
|
||||
buffer_ptr += 3;
|
||||
input_buffer_ptr += 3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// Decode one grid cell of one YOLOv8 output layer into at most one detection.
//
// confidence_cell_buf holds class_count_ raw class logits for this cell;
// coordinate_cell_buf holds 64 floats (4 box sides x 16 DFL bins — see the
// channel loop below). confs_tmp is caller-provided scratch of class_count_
// floats, reused across cells to avoid a per-cell allocation. A box that
// survives the confidence threshold is merged into `bboxes` through
// non_maximum_suppression().
//
// NOTE(review): this duplicates the softmax/box math also present in
// GetFeatureValues()/CalculateBboxParams(); consider unifying — verify first.
void YOLOv8::GetDetection(
    std::queue<BBox> &bboxes,
    int layer_id,
    float *confidence_cell_buf,
    float *coordinate_cell_buf,
    int row, int col, float *confs_tmp)
{
  // process confidence score
  float best_label_score = -1.0;
  // Assigned in the argmax loop below whenever class_count_ >= 1, because a
  // sigmoid output is always > 0 > -1. If class_count_ == 0 the early return
  // on the threshold check fires first (thresholds are kept in [0, 1]).
  int best_label;

  // Force use of SIMD to calculate sigmoid.
  //
  // The space tradeoff of the temp result buffer
  // is more than worth the speedup / CPU usage reduction
  // (~75% CPU [down from 90% due to other changes] down to 59% CPU on my laptop)
#pragma omp simd
  for (size_t label = 0; label < class_count_; label++) {
    confs_tmp[label] = mxutil_prepost_sigmoid(confidence_cell_buf[label]);
  }
  // no way to avoid O(n) here, though
  // Argmax over the sigmoid scores (kept scalar: the branch defeats simd).
  for (size_t label = 0; label < class_count_; label++) {
    if (confs_tmp[label] > best_label_score) {
      best_label_score = confs_tmp[label];
      best_label = label;
    }
  }

  // Reject the whole cell if even the best class is below threshold.
  if (best_label_score < confidence_thresh_)
    return;

  // DFL-style box decode: each of the 4 sides is a 16-bin distribution; the
  // decoded side length is the softmax-weighted expected bin index.
  std::vector<float> feature_value{};

  for (int channel = 0; channel < 4; channel++) { // split 64 into 4*16
    float value = 0.0;
    float *feature_buf = coordinate_cell_buf + channel * 16;
    float softmax_sum = 0.0;
    float local_max = feature_buf[0];

    // apply softmax and weighted sum
    // Channel max is subtracted before expf() for numerical stability.
    for (int i = 1; i < 16; i++) {
      if (feature_buf[i] > local_max)
        local_max = feature_buf[i];
    }

    // more SIMD hints
#pragma omp simd reduction(+:softmax_sum)
    for (int i = 0; i < 16; i++) {
      softmax_sum += expf(feature_buf[i] - local_max);
    }

    // more SIMD hints
#pragma omp simd reduction(+:value)
    for (int i = 0; i < 16; i++) {
      value += ((float)i * (float)(expf(feature_buf[i] - local_max) / softmax_sum));
    }
    feature_value.push_back(value);
  }

  // Convert the 4 decoded distances to a centered box in model-input pixels;
  // `ratio` is the layer's stride (grid cell -> pixel scale).
  float center_x, center_y, w, h;

  center_x = (feature_value[2] - feature_value[0] + 2 * (0.5 + ((float)col))) * 0.5 * yolo_post_layers_[layer_id].ratio;
  center_y = (feature_value[3] - feature_value[1] + 2 * (0.5 + ((float)row))) * 0.5 * yolo_post_layers_[layer_id].ratio;
  w = (feature_value[2] + feature_value[0]) * yolo_post_layers_[layer_id].ratio;
  h = (feature_value[3] + feature_value[1]) * yolo_post_layers_[layer_id].ratio;

  // printf("[Layer%d] (%0.3f) %s\t: %.2f\t,%.2f\t,%.2f\t,%.2f\t]\n", layer_id, best_label_score, class_labels_[best_label], center_x, center_y, w, h);

  // Clamp the corners to the accelerator input frame.
  float min_x = mxutil_max(center_x - 0.5 * w, .0);
  float min_y = mxutil_max(center_y - 0.5 * h, .0);
  float max_x = mxutil_min(center_x + 0.5 * w, accl_input_width_);
  float max_y = mxutil_min(center_y + 0.5 * h, accl_input_height_);

  BBox bbox(best_label, best_label_score, min_x, min_y, max_x, max_y);

  // NMS either enqueues the box or drops it against an overlapping better one.
  non_maximum_suppression(bboxes, bbox, iou_thresh_);
}
|
||||
|
||||
void YOLOv8::PostProcess(std::vector<float *> output_buffers, YOLOv8Result &result) {
|
||||
if (!valid_input_) {
|
||||
throw std::runtime_error("Make sure to call ComputePadding() before further processing.");
|
||||
}
|
||||
|
||||
if (output_buffers.empty()) {
|
||||
throw std::invalid_argument("output_buffers cannot be null.");
|
||||
}
|
||||
|
||||
float *confs_tmp = new float[class_count_];
|
||||
|
||||
for (size_t layer_id = 0; layer_id < kNumPostProcessLayers; ++layer_id) {
|
||||
const auto &layer = yolo_post_layers_[layer_id];
|
||||
const int confidence_floats_per_row = (layer.width * class_count_); // 80 x 80
|
||||
const int coordinate_floats_per_row = (layer.width * layer.coordinate_fmap_size); // 80 x 64
|
||||
|
||||
const int conf_id = layer.confidence_ofmap_flow_id;
|
||||
const int coord_id = layer.coordinate_ofmap_flow_id;
|
||||
|
||||
float *confidence_base = output_buffers[conf_id];
|
||||
float *coordinate_base = output_buffers[coord_id];
|
||||
|
||||
if (!confidence_base || !coordinate_base) {
|
||||
throw std::invalid_argument("One or more output buffers are null.");
|
||||
}
|
||||
|
||||
for (size_t row = 0; row < layer.height; row++) {
|
||||
float *confidence_row_buf = confidence_base + row * confidence_floats_per_row;
|
||||
float *coordinate_row_buf = coordinate_base + row * coordinate_floats_per_row;
|
||||
|
||||
for (size_t col = 0; col < layer.width; col++) {
|
||||
float *confidence_cell_buf = confidence_row_buf + col * class_count_;
|
||||
float *coordinate_cell_buf = coordinate_row_buf + col * layer.coordinate_fmap_size;
|
||||
GetDetection(result.bboxes, layer_id, confidence_cell_buf, coordinate_cell_buf, row, col, confs_tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete [] confs_tmp;
|
||||
}
|
||||
|
||||
void YOLOv8::CalculateBboxParams(const float *feature_values, int layer_id, int row, int col,
|
||||
float ¢er_x, float ¢er_y, float &box_width, float &box_height)
|
||||
{
|
||||
const float ratio = yolo_post_layers_[layer_id].ratio;
|
||||
|
||||
center_x = (feature_values[2] - feature_values[0] + 2.0f * (0.5f + static_cast<float>(col))) * 0.5f * ratio;
|
||||
center_y = (feature_values[3] - feature_values[1] + 2.0f * (0.5f + static_cast<float>(row))) * 0.5f * ratio;
|
||||
box_width = (feature_values[2] + feature_values[0]) * ratio;
|
||||
box_height = (feature_values[3] + feature_values[1]) * ratio;
|
||||
}
|
||||
|
||||
/**
 * @brief Decode the 64-float coordinate block of one cell into 4 side values.
 *        Each of the 4 channels is a 16-bin distribution; the decoded value is
 *        the softmax-weighted expected bin index.
 * @param coordinate_buffer 4 * 16 raw floats from the coordinate feature map.
 * @param feature_values Out: the 4 decoded values, one per channel.
 */
void YOLOv8::GetFeatureValues(const float *coordinate_buffer, float *feature_values)
{
  constexpr int kNumChannels = 4;
  constexpr int kChannelSize = 16;

  for (int channel = 0; channel < kNumChannels; ++channel) {
    const float *feature_buf = coordinate_buffer + channel * kChannelSize;

    // Find the channel max; subtracting it keeps expf() in range
    // (standard numerically-stable softmax).
    float local_max = feature_buf[0];
    for (int i = 1; i < kChannelSize; ++i) {
      if (feature_buf[i] > local_max) {
        local_max = feature_buf[i];
      }
    }

    // Softmax normalizer and weighted sum in a single pass.
    // Fix: the previous `temp[kChannelSize]` array was written and read only
    // within the same iteration — a scalar is sufficient.
    float softmax_sum = 0.0f;
    float value = 0.0f;
    for (int i = 0; i < kChannelSize; ++i) {
      const float exp_val = expf(feature_buf[i] - local_max);
      softmax_sum += exp_val;
      value += static_cast<float>(i) * exp_val;
    }
    feature_values[channel] = value / softmax_sum;
  }
}
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
#pragma once
|
||||
|
||||
#include "zm_memx_yolo_core.h"
|
||||
#include <queue>
|
||||
#include <mutex>
|
||||
|
||||
class YOLOv8 {
|
||||
public:
|
||||
/** @brief Constructor for using official 80 classes COCO dataset. */
|
||||
YOLOv8(size_t input_width, size_t input_height, size_t input_channel,
|
||||
float confidence_thresh, float iou_thresh);
|
||||
|
||||
/** @brief Constructor for using a customized dataset. */
|
||||
YOLOv8(size_t input_width, size_t input_height, size_t input_channel,
|
||||
float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels);
|
||||
|
||||
~YOLOv8();
|
||||
|
||||
/**
|
||||
* @brief Pre-process the input image data for YOLOv8 model inference.
|
||||
* @param rgb_data Pointer to the input image data in RGB format.
|
||||
* @param image_width Width of the display image.
|
||||
* @param image_height Height of the display image.
|
||||
* @param input_buffers Vector of pointers to the input buffers used by the accelerator for processing. Should be zero-initialized.
|
||||
*/
|
||||
void PreProcess(uint8_t *rgb_data, int image_width, int image_height, std::vector<float *> input_buffers);
|
||||
|
||||
/**
|
||||
* @brief Post-process the output data from the YOLOv8 model.
|
||||
* @param output_buffers Vector of pointers to the output buffers from the accelerator.
|
||||
* @param result Reference to the structure where the decoded bounding box results will be stored.
|
||||
*/
|
||||
void PostProcess(std::vector<float *> output_buffers, YOLOv8Result &result);
|
||||
|
||||
/** @brief Clear the detection results. */
|
||||
void ClearDetectionResults(YOLOv8Result &result);
|
||||
|
||||
/** @brief Setters and getters for the confidence threshold. */
|
||||
void SetConfidenceThreshold(float confidence);
|
||||
float GetConfidenceThreshold();
|
||||
|
||||
/** @brief Compute padding values for letterboxing from the display image. */
|
||||
void ComputePadding(int disp_width, int disp_height);
|
||||
|
||||
/** @brief Ensure the input dimensions are valid for horizontal display images only. */
|
||||
bool IsHorizontalInput(int disp_width, int disp_height);
|
||||
|
||||
private:
|
||||
/** @brief Structure representing per-layer information of YOLOv8 output. */
|
||||
struct LayerParams
|
||||
{
|
||||
uint8_t coordinate_ofmap_flow_id;
|
||||
uint8_t confidence_ofmap_flow_id;
|
||||
size_t width;
|
||||
size_t height;
|
||||
size_t ratio;
|
||||
size_t coordinate_fmap_size;
|
||||
};
|
||||
|
||||
/** @brief Initialization method to set up YOLOv8 model parameters. */
|
||||
void Init(size_t accl_input_width, size_t accl_input_height, size_t accl_input_channel,
|
||||
float confidence_thresh, float iou_thresh, size_t class_count, const char **class_labels);
|
||||
|
||||
/** @brief Helper methods for building detections from model output. */
|
||||
void GetDetection(std::queue<BBox> &bounding_boxes, int layer_id, float *confidence_buffer,
|
||||
float *coordinate_buffer, int row, int col, float *confs_tmp);
|
||||
|
||||
/** @brief Helper methods for calculating bounding box parameters from feature values. */
|
||||
void CalculateBboxParams(const float *feature_values, int layer_id, int row, int col,
|
||||
float ¢er_x, float ¢er_y, float &box_width, float &box_height);
|
||||
void GetFeatureValues(const float *coordinate_buffer, float *feature_values);
|
||||
|
||||
static constexpr size_t kNumPostProcessLayers = 3;
|
||||
struct LayerParams yolo_post_layers_[kNumPostProcessLayers];
|
||||
|
||||
// Model-specific parameters.
|
||||
const char **class_labels_;
|
||||
size_t class_count_;
|
||||
size_t accl_input_width_; // Input width to accelerator, obtained by dfp.
|
||||
size_t accl_input_height_; // Input height to accelerator, obtained by dfp.
|
||||
size_t accl_input_channel_; // Input channel to accelerator, obtained by dfp.
|
||||
|
||||
// Confidence and IOU thresholds.
|
||||
std::mutex confidence_mutex_;
|
||||
float confidence_thresh_;
|
||||
float iou_thresh_;
|
||||
|
||||
// Letterbox ratio and padding.
|
||||
float letterbox_ratio_;
|
||||
int letterbox_width_;
|
||||
int letterbox_height_;
|
||||
int padding_height_;
|
||||
int padding_width_;
|
||||
bool valid_input_;
|
||||
};
|
||||
|
|
@ -10,6 +10,7 @@
|
|||
#cmakedefine HAVE_EXECINFO_H 1
|
||||
#cmakedefine HAVE_UNTETHER_H 1
|
||||
#cmakedefine HAVE_MEMX_H 1
|
||||
#cmakedefine HAVE_MX_ACCL_H 1
|
||||
#cmakedefine HAVE_QUADRA 1
|
||||
#cmakedefine HAVE_UCONTEXT_H 1
|
||||
#cmakedefine HAVE_SYS_SENDFILE_H 1
|
||||
|
|
|
|||
Loading…
Reference in New Issue