diff --git a/Dockerfile.openvino b/Dockerfile.openvino index 676d8ef66aa..34e6c2260a8 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -296,4 +296,4 @@ ENV GLOG_logtostderr=1 ENV LD_LIBRARY_PATH=/usr/local/lib:/opt/intel/openvino/runtime/lib/intel64/:/opt/intel/openvino/runtime/3rdparty/tbb/lib/ WORKDIR /mediapipe -## End of demos image ######################################################### +## End of demos image ######################################################### \ No newline at end of file diff --git a/build_desktop_examples.sh b/build_desktop_examples.sh index 83e25fa4948..d9cfc1aeee1 100755 --- a/build_desktop_examples.sh +++ b/build_desktop_examples.sh @@ -93,6 +93,9 @@ for app in ${apps}; do target="${app}:extract_yt8m_features" echo "Skipping target ${target}" continue + elif [[ "${target_name}" == "bytetrack" ]]; then + echo "Skipping target ${target_name} ." + continue else target="${app}:${target_name}_cpu" fi diff --git a/mediapipe/calculators/openvino/BUILD b/mediapipe/calculators/openvino/BUILD index 84968c0f5ae..1ab7f33d48d 100644 --- a/mediapipe/calculators/openvino/BUILD +++ b/mediapipe/calculators/openvino/BUILD @@ -181,6 +181,32 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "openvino_yolox_tensors_to_detections_calculator", + srcs = ["openvino_yolox_tensors_to_detections_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":openvino_yolox_tensors_to_detections_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:detection_cc_proto", + "//mediapipe/framework/formats:location_data_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "//third_party:openvino", + ], + alwayslink = 1, +) + +mediapipe_proto_library( + name = "openvino_yolox_tensors_to_detections_calculator_proto", + srcs = ["openvino_yolox_tensors_to_detections_calculator.proto"], + visibility = ["//visibility:public"], + deps = [ + 
"//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], +) + # To run this with native GPU on Linux, use: # bazel test //mediapipe/calculators/tflite:tflite_inference_calculator_test --copt=-DTFLITE_GPU_EXTRA_GLES_DEPS --copt=-DMESA_EGL_NO_X11_HEADERS --copt=-DEGL_NO_X11 --config=grte_v5 --test_strategy=local cc_test( diff --git a/mediapipe/calculators/openvino/openvino_yolox_tensors_to_detections_calculator.cc b/mediapipe/calculators/openvino/openvino_yolox_tensors_to_detections_calculator.cc new file mode 100644 index 00000000000..a095ed07c71 --- /dev/null +++ b/mediapipe/calculators/openvino/openvino_yolox_tensors_to_detections_calculator.cc @@ -0,0 +1,154 @@ +#include "mediapipe/calculators/openvino/openvino_yolox_tensors_to_detections_calculator.pb.h" + +#include +#include +#include +#include + +#include + +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/framework/formats/location_data.pb.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" + +namespace mediapipe { + +// Converts YOLOX output OV tensors to MediaPipe Detections. 
+// +// YOLOX output tensor shape: [1, 85, 3549] +// Layout: [batch, num_attrs, num_boxes] (attribute-first, as checked in Process()) +// decode_in_inference=True: sigmoid already applied, coords already decoded +// Attributes: [cx, cy, w, h, obj_score, class_0, ..., class_79] +// Coordinates are in PIXEL space (input image 416x416), NOT normalized +// +// Input: +// TENSORS: Vector of ov::Tensor +// Output: +// DETECTIONS: Vector of Detection protos + +class OpenVINOYoloXTensorsToDetectionsCalculator : public CalculatorBase { + public: + static absl::Status GetContract(CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + if (cc->Inputs().HasTag("TENSORS")) + cc->Inputs().Tag("TENSORS").Set>(); + if (cc->Outputs().HasTag("DETECTIONS")) + cc->Outputs().Tag("DETECTIONS").Set>(); + return absl::OkStatus(); + } + + absl::Status Open(CalculatorContext* cc) override { + const auto& options = + cc->Options(); + min_thresh_ = options.has_conf_thresh() ? options.conf_thresh() : 0.1f; + input_size_ = options.has_input_size() ? 
options.input_size() : 416.0f; + cc->SetOffset(TimestampDiff(0)); + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) override { + if (cc->Inputs().Tag("TENSORS").IsEmpty()) + return absl::OkStatus(); + + const auto& tensors = + cc->Inputs().Tag("TENSORS").Get>(); + RET_CHECK(!tensors.empty()); + const ov::Tensor& raw = tensors[0]; + RET_CHECK(raw.get_element_type() == ov::element::f32); + + const auto& shape = raw.get_shape(); + RET_CHECK_EQ(shape.size(), 3u); + RET_CHECK_EQ(shape[0], 1u); + // Actual layout from TFLite: [1, 85, 3549] — attr-first + RET_CHECK_EQ(shape[1], static_cast(num_attrs_)); // 85 + RET_CHECK_EQ(shape[2], static_cast(num_boxes_)); // 3549 + + const float* data = raw.data(); + RET_CHECK(data != nullptr); + + // Accessor for [attr, box] layout + auto at = [&](int attr, int box) -> float { + return data[attr * num_boxes_ + box]; + }; + + // Grid strides for 416x416: + // stride 8 → 52x52 = 2704 boxes + // stride 16 → 26x26 = 676 boxes + // stride 32 → 13x13 = 169 boxes + // total = 3549 + struct GridInfo { int stride; int cols; int rows; }; + const std::vector grids = { + {8, 52, 52}, + {16, 26, 26}, + {32, 13, 13}, + }; + + auto output_detections = absl::make_unique>(); + + int box_idx = 0; + for (const auto& g : grids) { + for (int gy = 0; gy < g.rows; ++gy) { + for (int gx = 0; gx < g.cols; ++gx, ++box_idx) { + + // Sigmoid already baked in by TFLite Logistic ops + float obj = at(4, box_idx); + + int best_cls = 0; + float best_cls_score = 0.0f; + for (int c = 0; c < num_classes_; ++c) { + float s = at(5 + c, box_idx); + if (s > best_cls_score) { best_cls_score = s; best_cls = c; } + } + + float score = obj * best_cls_score; + if (score < min_thresh_) continue; + LOG(INFO)<<"CLASS: "<set_format(LocationData::RELATIVE_BOUNDING_BOX); + auto* bbox = loc->mutable_relative_bounding_box(); + bbox->set_xmin(x1); + bbox->set_ymin(y1); + bbox->set_width(x2 - x1); + bbox->set_height(y2 - y1); + det.add_score(score); + 
det.add_label_id(best_cls); + output_detections->emplace_back(det); + } + } + } + + cc->Outputs().Tag("DETECTIONS") + .Add(output_detections.release(), cc->InputTimestamp()); + return absl::OkStatus(); + } + private: + const int num_boxes_ = 3549; + const int num_attrs_ = 85; + const int num_classes_ = 80; + float input_size_; + float min_thresh_; +}; + +REGISTER_CALCULATOR(OpenVINOYoloXTensorsToDetectionsCalculator); + +} // namespace mediapipe \ No newline at end of file diff --git a/mediapipe/calculators/openvino/openvino_yolox_tensors_to_detections_calculator.proto b/mediapipe/calculators/openvino/openvino_yolox_tensors_to_detections_calculator.proto new file mode 100644 index 00000000000..b33ee04e004 --- /dev/null +++ b/mediapipe/calculators/openvino/openvino_yolox_tensors_to_detections_calculator.proto @@ -0,0 +1,15 @@ +syntax = 'proto2'; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message OpenVINOYoloXTensorsToDetectionsCalculatorOptions { + extend .mediapipe.CalculatorOptions { + optional OpenVINOYoloXTensorsToDetectionsCalculatorOptions ext = 211376657; + } + + optional float conf_thresh = 1 [default = 0.10]; + optional float input_size = 3 [default = 416.0]; + +} diff --git a/mediapipe/calculators/ovms/BUILD b/mediapipe/calculators/ovms/BUILD index b32f05e5e1e..88f293c5900 100644 --- a/mediapipe/calculators/ovms/BUILD +++ b/mediapipe/calculators/ovms/BUILD @@ -27,6 +27,7 @@ cc_library( "//mediapipe/calculators/openvino:openvino_tensors_to_detections_calculator_cc_proto", "//mediapipe/calculators/openvino:openvino_converter_calculator_cc_proto", "//mediapipe/calculators/openvino:openvino_converter_calculator", + "//mediapipe/calculators/openvino:openvino_yolox_tensors_to_detections_calculator", "//mediapipe/calculators/openvino:openvino_tensors_to_classification_calculator", "//mediapipe/calculators/openvino:openvino_tensors_to_detections_calculator", ":modelapiovmsadapter", diff --git a/mediapipe/calculators/ovms/config.json 
b/mediapipe/calculators/ovms/config.json index c9583066d9e..b6ef27777a7 100644 --- a/mediapipe/calculators/ovms/config.json +++ b/mediapipe/calculators/ovms/config.json @@ -4,6 +4,12 @@ "name":"ssdlite_object_detection", "base_path":"/mediapipe/mediapipe/models/ovms/ssdlite_object_detection" } + }, + { + "config":{ + "name":"yoloxt_float32", + "base_path":"/mediapipe/mediapipe/models/ovms/yoloxt_float32" + } } ] } \ No newline at end of file diff --git a/mediapipe/calculators/tflite/BUILD b/mediapipe/calculators/tflite/BUILD index 2526088d18f..9a700d0fc4d 100644 --- a/mediapipe/calculators/tflite/BUILD +++ b/mediapipe/calculators/tflite/BUILD @@ -85,6 +85,31 @@ mediapipe_proto_library( ], ) +cc_library( + name = "yolox_tensors_to_detections_calculator", + srcs = ["yolox_tensors_to_detections_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + ":yolox_tensors_to_detections_calculator_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:detection_cc_proto", + "//mediapipe/framework/formats:location_data_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/framework/port:status", + "@org_tensorflow//tensorflow/lite:framework", + ], + alwayslink = 1, +) + +mediapipe_proto_library( + name = "yolox_tensors_to_detections_calculator_proto", + srcs = ["yolox_tensors_to_detections_calculator.proto"], + deps = [ + "//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], +) + mediapipe_proto_library( name = "tflite_tensors_to_landmarks_calculator_proto", srcs = ["tflite_tensors_to_landmarks_calculator.proto"], diff --git a/mediapipe/calculators/tflite/yolox_tensors_to_detections_calculator.cc b/mediapipe/calculators/tflite/yolox_tensors_to_detections_calculator.cc new file mode 100644 index 00000000000..d943acd1f9d --- /dev/null +++ b/mediapipe/calculators/tflite/yolox_tensors_to_detections_calculator.cc @@ -0,0 +1,141 @@ +#include 
"mediapipe/calculators/tflite/yolox_tensors_to_detections_calculator.pb.h" + +#include +#include +#include + +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/framework/formats/location_data.pb.h" +#include "mediapipe/framework/port/ret_check.h" +#include "mediapipe/framework/port/status.h" +#include "tensorflow/lite/interpreter.h" + +namespace mediapipe { + +class YoloXTensorsToDetectionsCalculator : public CalculatorBase { + public: + static absl::Status GetContract(CalculatorContract* cc) { + RET_CHECK(!cc->Inputs().GetTags().empty()); + RET_CHECK(!cc->Outputs().GetTags().empty()); + if (cc->Inputs().HasTag("TENSORS")) + cc->Inputs().Tag("TENSORS").Set>(); + if (cc->Outputs().HasTag("DETECTIONS")) + cc->Outputs().Tag("DETECTIONS").Set>(); + return absl::OkStatus(); + } + + absl::Status Open(CalculatorContext* cc) override { + const auto& options = + cc->Options(); + min_thresh_ = options.has_conf_thresh() ? options.conf_thresh() : 0.1f; + input_size_ = options.has_input_size() ? 
options.input_size() : 416.0f; + cc->SetOffset(TimestampDiff(0)); + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) override { + LOG(INFO) << "YOLOXT2D process called"; + if (cc->Inputs().Tag("TENSORS").IsEmpty()) + return absl::OkStatus(); + + const auto& tensors = + cc->Inputs().Tag("TENSORS").Get>(); + RET_CHECK(!tensors.empty()); + + const TfLiteTensor& raw = tensors[0]; + RET_CHECK_EQ(raw.type, kTfLiteFloat32); + RET_CHECK_EQ(raw.dims->size, 3); + RET_CHECK_EQ(raw.dims->data[0], 1); + RET_CHECK_EQ(raw.dims->data[1], num_attrs_); + RET_CHECK_EQ(raw.dims->data[2], num_boxes_); + + const float* data = raw.data.f; + RET_CHECK(data != nullptr); + + std::vector buffer(data, data + num_attrs_ * num_boxes_); + + auto at = [&](int attr, int box) -> float { + return buffer[attr * num_boxes_ + box]; + }; + + struct GridInfo { int stride; int cols; int rows; }; + const std::vector grids = { + {8, 52, 52}, + {16, 26, 26}, + {32, 13, 13}, + }; + + auto output_detections = absl::make_unique>(); + + int box_idx = 0; + for (const auto& g : grids) { + for (int gy = 0; gy < g.rows; ++gy) { + for (int gx = 0; gx < g.cols; ++gx, ++box_idx) { + + // Sigmoid already baked in by TFLite Logistic ops + float obj = at(4, box_idx); + + int best_cls = 0; + float best_cls_score = 0.0f; + for (int c = 0; c < num_classes_; ++c) { + float s = at(5 + c, box_idx); + if (s > best_cls_score) { best_cls_score = s; best_cls = c; } + } + + float score = obj * best_cls_score; + if (score < min_thresh_) continue; + LOG(INFO)<<"CLASS: "<set_format(LocationData::RELATIVE_BOUNDING_BOX); + auto* bbox = loc->mutable_relative_bounding_box(); + bbox->set_xmin(x1); + bbox->set_ymin(y1); + bbox->set_width(x2 - x1); + bbox->set_height(y2 - y1); + det.add_score(score); + det.add_label_id(best_cls); + output_detections->emplace_back(det); + } + } + } + + cc->Outputs().Tag("DETECTIONS") + .Add(output_detections.release(), cc->InputTimestamp()); + return absl::OkStatus(); + } + + 
absl::Status Close(CalculatorContext* cc) override { // ✅ added for symmetry + return absl::OkStatus(); + } + + private: + + const int num_boxes_ = 3549; + const int num_attrs_ = 85; + const int num_classes_= 80; + + float input_size_; + float min_thresh_; +}; + +REGISTER_CALCULATOR(YoloXTensorsToDetectionsCalculator); + +} // namespace mediapipe \ No newline at end of file diff --git a/mediapipe/calculators/tflite/yolox_tensors_to_detections_calculator.proto b/mediapipe/calculators/tflite/yolox_tensors_to_detections_calculator.proto new file mode 100644 index 00000000000..8ca90c514af --- /dev/null +++ b/mediapipe/calculators/tflite/yolox_tensors_to_detections_calculator.proto @@ -0,0 +1,14 @@ +syntax = 'proto2'; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message YoloXTensorsToDetectionsCalculatorOptions { + extend .mediapipe.CalculatorOptions { + optional YoloXTensorsToDetectionsCalculatorOptions ext = 211376660; + } + + optional float conf_thresh = 1 [default = 0.10]; + optional float input_size = 3 [default = 416.0]; +} diff --git a/mediapipe/calculators/util/BUILD b/mediapipe/calculators/util/BUILD index 3f0fdb6a237..af73620a0f0 100644 --- a/mediapipe/calculators/util/BUILD +++ b/mediapipe/calculators/util/BUILD @@ -199,6 +199,30 @@ cc_test( ], ) +cc_library( + name = "detection_color_by_id_calculator", + srcs = ["detection_color_by_id_calculator.cc"], + deps=[ + ":detection_color_by_id_calculator_cc_proto", + "//mediapipe/util:render_data_cc_proto", + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:detection_cc_proto", + "//mediapipe/framework/formats:location_data_cc_proto", + "//mediapipe/framework/port:ret_check", + "//mediapipe/util:color_cc_proto", + ], + alwayslink = 1, +) + +mediapipe_proto_library( + name = "detection_color_by_id_calculator_proto", + srcs = ["detection_color_by_id_calculator.proto"], + deps = [ + "//mediapipe/framework:calculator_options_proto", + 
"//mediapipe/framework:calculator_proto", + ], +) + cc_library( name = "packet_latency_calculator", srcs = ["packet_latency_calculator.cc"], diff --git a/mediapipe/calculators/util/detection_color_by_id_calculator.cc b/mediapipe/calculators/util/detection_color_by_id_calculator.cc new file mode 100644 index 00000000000..7ca77a7eddc --- /dev/null +++ b/mediapipe/calculators/util/detection_color_by_id_calculator.cc @@ -0,0 +1,111 @@ +#include "mediapipe/calculators/util/detection_color_by_id_calculator.pb.h" + +#include + +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/util/render_data.pb.h" +#include "mediapipe/util/color.pb.h" + +namespace mediapipe { + +class DetectionColorByIdCalculator : public CalculatorBase { + public: + static absl::Status GetContract(CalculatorContract* cc) { + cc->Inputs().Tag("DETECTIONS").Set>(); + cc->Outputs().Tag("RENDER_DATA").Set(); + return absl::OkStatus(); + } + absl::Status Open(CalculatorContext* cc) override{ + const auto& options = + cc->Options(); + thickness_ = options.has_thickness() ? options.thickness() : 4.0f; + saturation_ = options.has_saturation() ? options.saturation() : 0.85f; + value_ = options.has_value() ? options.value() : 0.95f; + return absl::OkStatus(); + } + absl::Status Process(CalculatorContext* cc) override { + const auto& detections = + cc->Inputs().Tag("DETECTIONS").Get>(); + + auto render_data = std::make_unique(); + + for (const auto& det : detections) { + int id = det.detection_id(); + mediapipe::Color color = IdToColor(id); + + const auto& bbox = det.location_data().relative_bounding_box(); + + // ── 1. 
Bounding box ────────────────────────────────────────────────── + { + auto* a = render_data->add_render_annotations(); + *a->mutable_color() = color; + a->set_thickness(thickness_); + + auto* rect = a->mutable_rectangle(); + rect->set_left(bbox.xmin()); + rect->set_top(bbox.ymin()); + rect->set_right(bbox.xmin() + bbox.width()); + rect->set_bottom(bbox.ymin() + bbox.height()); + rect->set_normalized(true); + } + + // ── 2. Label ─────────────────────────────────────────────────────── + { + auto* a = render_data->add_render_annotations(); + *a->mutable_color() = color; // same color as box + a->set_thickness(thickness_-1.0f); + + auto* text = a->mutable_text(); + std::string label = "ID:" + std::to_string(id); + if (!det.label().empty()) + label += " " + det.label(0); + if (det.score_size() > 0) { + char buf[8]; + std::snprintf(buf, sizeof(buf), " %.2f", det.score(0)); + label += buf; + } + + text->set_display_text(label); + text->set_normalized(true); + text->set_left(bbox.xmin() + 0.005f); + text->set_baseline(bbox.ymin() + 0.04f); + text->set_font_height(0.035f); + } + } + + cc->Outputs().Tag("RENDER_DATA").Add(render_data.release(), + cc->InputTimestamp()); + return absl::OkStatus(); + } + private: + mediapipe::Color IdToColor(int id) { + // Golden angle ensures max visual distance between consecutive IDs + const float kGoldenAngle = 137.508f; + float hue = std::fmod(id * kGoldenAngle, 360.0f); + float chroma = value_ * saturation_; + float x = chroma * (1.0f - std::fabs(std::fmod(hue / 60.0f, 2.0f) - 1.0f)); + float m = value_ - chroma; + + float r = 0, g = 0, b = 0; + if (hue < 60) { r = chroma; g = x;} + else if (hue < 120) { r = x; g = chroma;} + else if (hue < 180) { g = chroma; b = x;} + else if (hue < 240) { g = x; b = chroma;} + else if (hue < 300) { r = x; b = chroma;} + else { r = chroma; b = x;} + + mediapipe::Color color; + color.set_r(static_cast((r + m) * 255)); + color.set_g(static_cast((g + m) * 255)); + color.set_b(static_cast((b + m) * 255)); + 
return color; +} + float thickness_ = 4.0f; + float saturation_ = 0.85f; + float value_ = 0.95f; +}; + +REGISTER_CALCULATOR(DetectionColorByIdCalculator); + +} // namespace mediapipe \ No newline at end of file diff --git a/mediapipe/calculators/util/detection_color_by_id_calculator.proto b/mediapipe/calculators/util/detection_color_by_id_calculator.proto new file mode 100644 index 00000000000..9461e84be88 --- /dev/null +++ b/mediapipe/calculators/util/detection_color_by_id_calculator.proto @@ -0,0 +1,15 @@ +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message DetectionColorByIdCalculatorOptions{ + // hue is calculated using track_id* 137.508f % 360, In order to get unique color for bbox based on id + extend CalculatorOptions { + optional DetectionColorByIdCalculatorOptions ext = 259397841; + } + optional float saturation = 1 [default = 0.85]; + optional float value = 2 [default = 0.95]; + optional float thickness = 3 [default = 5.0]; +} \ No newline at end of file diff --git a/mediapipe/examples/desktop/bytetrack/BUILD b/mediapipe/examples/desktop/bytetrack/BUILD new file mode 100644 index 00000000000..9f96f500968 --- /dev/null +++ b/mediapipe/examples/desktop/bytetrack/BUILD @@ -0,0 +1,28 @@ +licenses(["notice"]) + +package(default_visibility = ["//mediapipe/examples:__subpackages__"]) + + +cc_binary( + name = "bytetrack_final_cpu", + data = [ + "//mediapipe/models:yolox_tiny_float32", + "//mediapipe/models:coco_labels", + ], + deps = [ + "//mediapipe/examples/desktop:demo_run_graph_main", + "//mediapipe/graphs/bytetrack:bytetrack_cpu_calculators", + ], +) + +cc_binary( + name = "bytetrack_final_ovms", + data = [ + "//mediapipe/models:coco_labels", + ], + deps = [ + "@ovms//src:ovms_lib", + "//mediapipe/examples/desktop:demo_run_graph_main", + "//mediapipe/graphs/bytetrack:bytetrack_ovms_calculators", + ], +) diff --git a/mediapipe/examples/desktop/bytetrack/README.md 
b/mediapipe/examples/desktop/bytetrack/README.md new file mode 100644 index 00000000000..fd4b6ca3a23 --- /dev/null +++ b/mediapipe/examples/desktop/bytetrack/README.md @@ -0,0 +1,299 @@ +# ByteTrack Demo + +This project demonstrates real-time multi-object tracking with YOLOX-Tiny and ByteTrack, built on MediaPipe pipelines. + +The demos explore different inference backends and graph architectures. +## Model Installation + +For this demo, we use **YOLOX-Tiny**. + +* Download the COCO labels file [here](https://github.com/openvinotoolkit/open_model_zoo/blob/master/data/dataset_classes/coco_80cl.txt). + +> **Note** +> +> The YOLOX-Tiny ONNX weights can be downloaded from the following link: +> +> https://github.com/Megvii-BaseDetection/YOLOX/tree/main/demo/ONNXRuntime +> +> The model conversion process is as follows: +> +> 1. Install the `onnx2tf` library with `pip install onnx2tf`. +> 2. Convert the ONNX model to TensorFlow Lite (`.tflite`) format using `onnx2tf`. +> +> The provided `.tflite` model was generated using this conversion pipeline. +> The labels used in this demo are the standard COCO labels. + +### Model Placement + +After downloading the model weights and labels file: + +1. Copy `yolox_tiny_float32.tflite` to both `mediapipe/models/` (loaded by the CPU graph) and `mediapipe/models/ovms/yoloxt_float32/1/` (served by OVMS). +2. Rename `coco_80cl.txt` to `coco_labels.txt`, and move it to `mediapipe/models/`. + +Create the following directory structure if it does not already exist: + +```text +mediapipe/ +└── models/ + ├── coco_labels.txt + ├── yolox_tiny_float32.tflite + └── ovms/ + └── yoloxt_float32/ + └── 1/ + └── yolox_tiny_float32.tflite +``` + +Once the files are placed correctly, you can proceed with building and running the demos. + + +## Available Demos + +> **Note:** `PassThroughRenderDataCalculator` is used during the initial development stage for debugging graph outputs. + +--- + +# 1. `bytetrack_final_cpu` + +This demo performs ByteTrack inference using a TensorFlow Lite YOLOX model. 
+ +## Build + +```bash +bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 \ +mediapipe/examples/desktop/bytetrack:bytetrack_final_cpu +``` + +## Run + +```bash +bazel-bin/mediapipe/examples/desktop/bytetrack/bytetrack_final_cpu \ +--calculator_graph_config_file=mediapipe/graphs/bytetrack/bytetrack_cpu.pbtxt \ +--input_video_path=/mediapipe/mediapipe/examples/desktop/object_detection/test_video.mp4 \ +--output_video_path=/mediapipe/out_bt_cpu.mp4 +``` + +## Graph Flow + +```mermaid +%%{init: { +"flowchart": { + "nodeSpacing": 35, + "rankSpacing": 45, + "curve": "basis" +}, +"themeVariables": { + "fontSize": "15px" +} +}}%% + +flowchart TD + + A["input_video"] + + A ==> FL["FlowLimiterCalculator"] + + FL ==> B["ImageTransformationCalculator
416x416 Resize"] + + B ==> C["TfLiteConverterCalculator
Image → TfLiteTensor"] + + C ==> D["TfLiteInferenceCalculator
YOLOX Tiny TFLite"] + + D ==> E["YoloXTensorsToDetectionsCalculator
YOLOX Decode"] + + E ==> F["NonMaxSuppressionCalculator
IOU = 0.45"] + + F ==> G["DetectionLabelIdToTextCalculator
COCO Labels"] + + G ==> H["DetectionUniqueIdCalculator"] + + H ==> I["ByteTrackCalculator
Multi-Object Tracking"] + + I ==> J["DetectionColorByIdCalculator"] + + J ==> K["AnnotationOverlayCalculator"] + + A ==> K + + K ==> L["output_video"] + + I -. "FINISHED" .-> FL +``` + +--- + +# 2. `bytetrack_final_ovms` + +This demo performs ByteTrack inference using OpenVINO Model Server (OVMS). + +## Build + +```bash +bazel build -c opt --define MEDIAPIPE_DISABLE_GPU=1 \ +mediapipe/examples/desktop/bytetrack:bytetrack_final_ovms +``` + +## Run + +```bash +bazel-bin/mediapipe/examples/desktop/bytetrack/bytetrack_final_ovms \ +--calculator_graph_config_file=mediapipe/graphs/bytetrack/bytetrack_ovms.pbtxt \ +--input_video_path=/mediapipe/mediapipe/examples/desktop/object_detection/test_video.mp4 \ +--output_video_path=/mediapipe/out_bt_ovms.mp4 +``` + +## Graph Flow + +```mermaid +%%{init: { +"flowchart": { + "nodeSpacing": 35, + "rankSpacing": 45, + "curve": "basis" +}, +"themeVariables": { + "fontSize": "15px" +} +}}%% + +flowchart TD + + A["input_video"] + + A ==> B["ImageTransformationCalculator
416×416 Resize"] + + B ==> C["OpenVINOConverterCalculator
Image → OV Tensor"] + + S["OpenVINOModelServerSessionCalculator
OVMS Session"] + + S -.-> D + + C ==> D["OpenVINOInferenceCalculator
YOLOXt Inference"] + + D ==> E["OpenVINOYoloXTensorsToDetectionsCalculator
YOLOX Decode"] + + E ==> F["NonMaxSuppressionCalculator
IOU = 0.45"] + + F ==> G["DetectionLabelIdToTextCalculator
Label Mapping"] + + G ==> H["DetectionUniqueIdCalculator"] + + H ==> I["ByteTrackCalculator
Multi-Object Tracking"] + + I ==> J["DetectionColorByIdCalculator"] + + J ==> K["PassThroughRenderDataCalculator"] + + A ==> L["AnnotationOverlayCalculator"] + + K ==> L + + L ==> M["output_video"] +``` + +--- + +# Custom Calculators and Utilities + +## Tracking Utilities + +### [`matching_utils.h`](../../../graphs/bytetrack/calculators/matching_utils.h) + +Utility header containing helper functions used for matching and object association during ByteTrack execution. + +Implemented methods: + +- `ComputeIoU` — Computes IoU score between two detection boxes. +- `BuildIoUCostMatrix` — Builds the IoU cost matrix from detections and tracks. +- `FuseScore` — Computes fused score between cost matrix and detections. +- `LinearAssignment` — Performs linear assignment using the Jonker–Volgenant algorithm *(currently under development)*. + +--- + +## Kalman Filter + +### [`kalman_filter.cc`](../../../graphs/bytetrack/calculators/kalman_filter.cc) + +Implements the Kalman filter logic used by ByteTrack. + +The corresponding class structure is defined in: + +* [`kalman_filter.h`](../../../graphs/bytetrack/calculators/kalman_filter.h) + +Methods: + +- `Initiate` — Initializes the Kalman filter state for a new track. +- `Predict` — Predicts the next object state using the previous state. +- `Update` — Corrects the predicted state using the latest detection. +- `MultiPredict` — Performs batch prediction for multiple active tracks. + + + +## Base Tracking Object + +### [`basetrack.cc`](../../../graphs/bytetrack/calculators/basetrack.cc) + +Defines the base tracking abstraction used in ByteTrack. + +Class structure: + +- [`basetrack.h`](../../../graphs/bytetrack/calculators/basetrack.h) + + +## STrack Object + +### [`strack.cc`](../../../graphs/bytetrack/calculators/strack.cc) + +Defines the `STrack` object used by ByteTrack for managing individual tracked objects. 
+ +Class structure: + +- [`strack.h`](../../../graphs/bytetrack/calculators/strack.h) + +Methods: + +- `Predict` — Predicts the next object position. +- `Activate` — Activates a new track from an unmatched detection. +- `ReActivate` — Re-activates a previously lost track. +- `Update` — Updates the track state using the latest matched detection. + + +## Main ByteTrack Calculator + +### [`bytetrack_calculator.cc`](../../../graphs/bytetrack/calculators/bytetrack_calculator.cc) + +Main calculator implementing the ByteTrack algorithm. + + +## YOLOX Tensor Decoders + +### [`yolox_tensors_to_detections_calculator.cc`](../../../calculators/tflite/yolox_tensors_to_detections_calculator.cc) + +Converts YOLOX TensorFlow Lite output tensors into MediaPipe `Detection` objects. + +### [`openvino_yolox_tensors_to_detections_calculator.cc`](../../../calculators/openvino/openvino_yolox_tensors_to_detections_calculator.cc) + +Converts YOLOX OpenVINO output tensors into MediaPipe `Detection` objects. + + +## Detection Visualization + +### [`detection_color_by_id_calculator.cc`](../../../calculators/util/detection_color_by_id_calculator.cc) + +Assigns a distinct visualization color to each detection based on its tracking ID. + +- Hue is derived from the detection ID using the golden angle (`id * 137.508 mod 360`). +- Saturation, value, and box thickness can be configured through calculator options, e.g.: 
+ ``` + node { + calculator: "DetectionColorByIdCalculator" + input_stream: "DETECTIONS:tracked_detections" + output_stream: "RENDER_DATA:detections_render_data" + options: { + [mediapipe.DetectionColorByIdCalculatorOptions.ext] { + saturation: 0.85 + value: 0.95 + } + } + } + + ``` diff --git a/mediapipe/examples/desktop/bytetrack/palace.mp4 b/mediapipe/examples/desktop/bytetrack/palace.mp4 new file mode 100644 index 00000000000..db93ce2b41d Binary files /dev/null and b/mediapipe/examples/desktop/bytetrack/palace.mp4 differ diff --git a/mediapipe/graphs/bytetrack/BUILD b/mediapipe/graphs/bytetrack/BUILD new file mode 100644 index 00000000000..e0046075849 --- /dev/null +++ b/mediapipe/graphs/bytetrack/BUILD @@ -0,0 +1,52 @@ +# Copyright 2019 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +load( + "//mediapipe/framework/tool:mediapipe_graph.bzl", + "mediapipe_binary_graph", +) + +licenses(["notice"]) + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "bytetrack_cpu_calculators", + deps = [ + "//mediapipe/calculators/video:opencv_video_decoder_calculator", + "//mediapipe/calculators/core:flow_limiter_calculator", + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/tflite:tflite_converter_calculator", + "//mediapipe/calculators/tflite:tflite_inference_calculator", + "//mediapipe/calculators/util:non_max_suppression_calculator", # NMS for YOLOX-N + "//mediapipe/calculators/util:annotation_overlay_calculator", + "//mediapipe/calculators/util:detection_label_id_to_text_calculator", + "//mediapipe/calculators/video:opencv_video_encoder_calculator", + "//mediapipe/graphs/bytetrack/calculators:bytetrack_calculators", + ], +) + +cc_library( + name = "bytetrack_ovms_calculators", + deps = [ + "//mediapipe/calculators/image:image_transformation_calculator", + "//mediapipe/calculators/ovms:ovms_calculator", + "//mediapipe/calculators/util:detection_label_id_to_text_calculator", + "//mediapipe/calculators/video:opencv_video_decoder_calculator", + "//mediapipe/calculators/video:opencv_video_encoder_calculator", + "//mediapipe/graphs/bytetrack/calculators:bytetrack_calculators", + "//mediapipe/calculators/util:non_max_suppression_calculator", # NMS for YOLOX-N + "//mediapipe/calculators/util:annotation_overlay_calculator", + ], +) \ No newline at end of file diff --git a/mediapipe/graphs/bytetrack/bytetrack_cpu.pbtxt b/mediapipe/graphs/bytetrack/bytetrack_cpu.pbtxt new file mode 100644 index 00000000000..3bcf292e7ac --- /dev/null +++ b/mediapipe/graphs/bytetrack/bytetrack_cpu.pbtxt @@ -0,0 +1,146 @@ +input_stream: "input_video" # ACTUAL INPUT +# input_side_packet: "INPUT_FILE_PATH:input_file_path" #simple_run_graph_main.cc +output_stream: "output_video" #ACTUAL OUTPUT + +node { + calculator: 
"FlowLimiterCalculator" + input_stream: "input_video" + input_stream: "FINISHED:tracked_detections" + input_stream_info: { + tag_index: "FINISHED" + back_edge: true + } + output_stream: "throttled_input_video" + node_options: { + [type.googleapis.com/mediapipe.FlowLimiterCalculatorOptions] { + max_in_flight: 1 + max_in_queue: 1 + } + } +} + +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE:throttled_input_video" + output_stream: "IMAGE:transformed_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 416 + output_height: 416 + } + } +} + +# Converts the transformed input image on CPU into an image tensor stored as a +# TfLiteTensor. +node { + calculator: "TfLiteConverterCalculator" + input_stream: "IMAGE:transformed_input_video" + output_stream: "TENSORS:image_tensor" + node_options: { + [type.googleapis.com/mediapipe.TfLiteConverterCalculatorOptions] { + use_custom_normalization: true + custom_div: 1.0 + custom_sub: 0.0 + } + } +} + +# Runs a TensorFlow Lite model on CPU that takes an image tensor and outputs a +# vector of tensors representing, for instance, detection boxes/keypoints and +# scores. +node { + calculator: "TfLiteInferenceCalculator" + input_stream: "TENSORS:image_tensor" + output_stream: "TENSORS:tensors" + node_options: { + [type.googleapis.com/mediapipe.TfLiteInferenceCalculatorOptions] { + model_path: "mediapipe/models/yolox_tiny_float32.tflite" + } + } +} + +### WRITE YOLO SPECIFIC CALCULATORS + +node{ + calculator: "YoloXTensorsToDetectionsCalculator" + input_stream: "TENSORS:tensors" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.YoloXTensorsToDetectionsCalculatorOptions] { + conf_thresh: 0.35 + } + } +} + +# Performs non-max suppression to remove excessive detections. 
+node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "detections" + output_stream: "filtered_detections" + node_options: { + [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { + min_suppression_threshold: 0.45 + max_num_detections: 100 + overlap_type: INTERSECTION_OVER_UNION + return_empty_detections: true + } + } +} +# Maps detection label IDs to the corresponding label text. The label map is +# provided in the label_map_path option. +node { + calculator: "DetectionLabelIdToTextCalculator" + input_stream: "filtered_detections" + output_stream: "output_detections" + node_options: { + [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { + label_map_path: "mediapipe/models/coco_labels.txt" + } + } +} + +node { + calculator: "ByteTrackCalculator" + input_stream: "DETECTIONS:output_detections" + output_stream: "DETECTIONS:tracked_detections" + options: { + [mediapipe.ByteTrackCalculatorOptions.ext] { + track_high_threshold:0.4 + track_low_threshold:0.1 + new_track_threshold:0.25 + matching_threshold: 0.8 + track_buffer: 30 + fuse_score: true + } + } +} + +# Converts the detections to drawing primitives for annotation overlay. +node { + calculator: "DetectionColorByIdCalculator" + input_stream: "DETECTIONS:tracked_detections" + output_stream: "RENDER_DATA:detections_render_data" + options: { + [mediapipe.DetectionColorByIdCalculatorOptions.ext] { + saturation: 0.7 + value: 0.6 + } + } +} + + +#node { +# calculator: "PassThroughRenderDataCalculator" +# input_stream: "RENDER_DATA:detections_render_data" +# output_stream: "RENDER_DATA:debug_render_data" +#} + + +# Draws annotations and overlays them on top of the input images. 
+node { + calculator: "AnnotationOverlayCalculator" + input_stream: "IMAGE:input_video" + input_stream: "detections_render_data" + output_stream: "IMAGE:output_video" +} diff --git a/mediapipe/graphs/bytetrack/bytetrack_ovms.pbtxt b/mediapipe/graphs/bytetrack/bytetrack_ovms.pbtxt new file mode 100644 index 00000000000..4ab25f836a1 --- /dev/null +++ b/mediapipe/graphs/bytetrack/bytetrack_ovms.pbtxt @@ -0,0 +1,138 @@ + +input_stream: "input_video" +# input_side_packet: "INPUT_FILE_PATH:input_file_path" #simple_run_graph_main.cc +output_stream: "output_video" + +node: { + calculator: "ImageTransformationCalculator" + input_stream: "IMAGE:input_video" + output_stream: "IMAGE:transformed_input_video" + node_options: { + [type.googleapis.com/mediapipe.ImageTransformationCalculatorOptions] { + output_width: 416 + output_height: 416 + } + } +} + +node { + calculator: "OpenVINOConverterCalculator" + input_stream: "IMAGE:transformed_input_video" + output_stream: "TENSORS:image_tensor" + node_options: { + [type.googleapis.com/mediapipe.OpenVINOConverterCalculatorOptions] { + enable_normalization: true + use_custom_normalization: true + custom_div: 1.0 + custom_sub: 0.0 + } + } +} +# Provides an OpenVINO Model Server session (as a side packet) for the servable +# named below; the OpenVINOInferenceCalculator node that follows uses this +# session to run the model on the image tensor. 
+node { + calculator: "OpenVINOModelServerSessionCalculator" + output_side_packet: "SESSION:session" + node_options: { + [type.googleapis.com/mediapipe.OpenVINOModelServerSessionCalculatorOptions]: { + servable_name: "yoloxt_float32" # servable name inside OVMS + servable_version: "1" + server_config: "mediapipe/calculators/ovms/config.json" + } + } +} + +node { + calculator: "OpenVINOInferenceCalculator" + input_side_packet: "SESSION:session" + input_stream: "OVTENSORS:image_tensor" + output_stream: "OVTENSORS2:detection_tensors" + node_options: { + [type.googleapis.com/mediapipe.OpenVINOInferenceCalculatorOptions]: { + input_order_list :["images"] + output_order_list :["output"] + } + } +} + +### WRITE YOLO SPECIFIC CALCULATORS + +node{ + calculator: "OpenVINOYoloXTensorsToDetectionsCalculator" + input_stream: "TENSORS:detection_tensors" + output_stream: "DETECTIONS:detections" + node_options: { + [type.googleapis.com/mediapipe.OpenVINOYoloXTensorsToDetectionsCalculatorOptions] { + conf_thresh: 0.1 + } + } +} + +# Performs non-max suppression to remove excessive detections. +node { + calculator: "NonMaxSuppressionCalculator" + input_stream: "detections" + output_stream: "filtered_detections" + node_options: { + [type.googleapis.com/mediapipe.NonMaxSuppressionCalculatorOptions] { + min_suppression_threshold: 0.65 + max_num_detections: 100 + overlap_type: INTERSECTION_OVER_UNION + return_empty_detections: true + } + } +} + + +# Maps detection label IDs to the corresponding label text. The label map is +# provided in the label_map_path option. 
+node { + calculator: "DetectionLabelIdToTextCalculator" + input_stream: "filtered_detections" + output_stream: "output_detections" + node_options: { + [type.googleapis.com/mediapipe.DetectionLabelIdToTextCalculatorOptions] { + label_map_path: "mediapipe/models/coco_labels.txt" + } + } +} + +node { + calculator: "ByteTrackCalculator" + input_stream: "DETECTIONS:output_detections" + output_stream: "DETECTIONS:tracked_detections" + options: { + [mediapipe.ByteTrackCalculatorOptions.ext] { + track_high_threshold:0.5 + track_low_threshold:0.1 + new_track_threshold:0.25 + matching_threshold: 0.8 + track_buffer: 30 + fuse_score: true + } + } +} + +# Converts the detections to drawing primitives for annotation overlay. +node { + calculator: "DetectionColorByIdCalculator" + input_stream: "DETECTIONS:tracked_detections" + output_stream: "RENDER_DATA:detections_render_data" +} + + +node { + calculator: "PassThroughRenderDataCalculator" + input_stream: "RENDER_DATA:detections_render_data" + output_stream: "RENDER_DATA:debug_render_data" +} + + +# Draws annotations and overlays them on top of the input images. +node { + calculator: "AnnotationOverlayCalculator" + input_stream: "IMAGE:input_video" + input_stream: "debug_render_data" + output_stream: "IMAGE:output_video" +} diff --git a/mediapipe/graphs/bytetrack/calculators/BUILD b/mediapipe/graphs/bytetrack/calculators/BUILD new file mode 100644 index 00000000000..3b681c23525 --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/BUILD @@ -0,0 +1,125 @@ +# Copyright 2019 The MediaPipe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("//mediapipe/framework/port:build_config.bzl", "mediapipe_proto_library") + +licenses(["notice"]) + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "bytetrack_calculators", + deps = [ + "//mediapipe/calculators/util:detection_color_by_id_calculator", + "//mediapipe/calculators/tflite:yolox_tensors_to_detections_calculator", + ":kalman_filter", + ":kalman_matrices", + ":strack", + ":bytetrack_calculator", + ":render_data_passthrough", + ], + alwayslink = 1 +) + +mediapipe_proto_library( + name = "bytetrack_calculator_proto", + visibility = ["//visibility:public"], + srcs = ["bytetrack_calculator.proto"], + deps=[ + "//mediapipe/framework:calculator_options_proto", + "//mediapipe/framework:calculator_proto", + ], + alwayslink = 1 +) + +cc_library( + name = "kalman_matrices", + hdrs = ["kalman_matrices.h"], + deps = ["@eigen_archive//:eigen3"], + alwayslink = 1 +) + + +cc_library( + name = "matching_utils", + hdrs = ["matching_utils.h"], + deps = [ + "//mediapipe/util/tracking:box_tracker", + ], + alwayslink = 1 +) + +cc_library( + name = "kalman_filter", + srcs = ["kalman_filter.cc"], + hdrs = ["kalman_filter.h"], + deps = [ + ":kalman_matrices", + "//mediapipe/framework/formats:detection_cc_proto", + "@eigen_archive//:eigen3", + ], + alwayslink = 1 +) + +cc_library( + name = "basetrack", + srcs = ["basetrack.cc"], + hdrs = ["basetrack.h"], + alwayslink = 1 +) + +cc_library( + name = "strack", + srcs = ["strack.cc"], + hdrs = ["strack.h"], + deps = [ + ":kalman_filter", + ":basetrack", + 
"//mediapipe/framework/formats:detection_cc_proto", + "@eigen_archive//:eigen3", + ], + alwayslink = 1 +) + +cc_library( + name = "bytetrack_calculator", + srcs = ["bytetrack_calculator.cc"], + hdrs = ["bytetrack_calculator.h"], + deps = [ + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:detection_cc_proto", + ":bytetrack_calculator_cc_proto", + "//mediapipe/framework/port:status", + ":matching_utils", + ":strack", + ":kalman_filter", + ":kalman_matrices", + "@eigen_archive//:eigen3", + ], + alwayslink = 1, +) + +# This calculator is for debugging to check whether render data is correctly passed through the graph. +cc_library( + name = "render_data_passthrough", + srcs = ["render_data_passthrough_calculator.cc"], + visibility = ["//visibility:public"], + deps = [ + "//mediapipe/framework:calculator_framework", + "//mediapipe/framework/formats:detection_cc_proto", + "//mediapipe/framework/port:status", + "//mediapipe/util:render_data_cc_proto", + ], + alwayslink = 1 +) diff --git a/mediapipe/graphs/bytetrack/calculators/basetrack.cc b/mediapipe/graphs/bytetrack/calculators/basetrack.cc new file mode 100644 index 00000000000..2abbd5bd38d --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/basetrack.cc @@ -0,0 +1,10 @@ +// basetrack.cpp +#include "mediapipe/graphs/bytetrack/calculators/basetrack.h" + +namespace mediapipe { +namespace bytetrack { + +int BaseTrack::count_ = 0; + +} // namespace bytetrack +} // namespace mediapipe \ No newline at end of file diff --git a/mediapipe/graphs/bytetrack/calculators/basetrack.h b/mediapipe/graphs/bytetrack/calculators/basetrack.h new file mode 100644 index 00000000000..6a507f583eb --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/basetrack.h @@ -0,0 +1,40 @@ +#ifndef MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_BASETRACK_H_ +#define MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_BASETRACK_H_ + +namespace mediapipe { +namespace bytetrack { + +class BaseTrack { + public: + enum class TrackState { 
NEW, TRACKED, LOST, REMOVED }; + + // mirrors Python's next_id() — static counter owned here + static int next_id() { return ++count_; } + static void reset_id() { count_ = 0; } + + // Accessors + int track_id() const { return track_id_; } + int frame_id() const { return frame_id_; } + int start_frame() const { return start_frame_; } + float score() const { return score_; } + TrackState state() const { return state_; } + bool is_activated() const { return is_activated_; } + + void MarkLost() { state_ = TrackState::LOST; } + void MarkRemoved() { state_ = TrackState::REMOVED; } + + protected: + int track_id_ = 0; + int frame_id_ = 0; + int start_frame_ = 0; + float score_ = 0.f; + bool is_activated_ = false; + TrackState state_ = TrackState::NEW; + static int count_; + +}; + +} // namespace bytetrack +} // namespace mediapipe + +#endif \ No newline at end of file diff --git a/mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.cc b/mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.cc new file mode 100644 index 00000000000..0be5f182c1c --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.cc @@ -0,0 +1,452 @@ +#include "bytetrack_calculator.h" +#include "mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.pb.h" + +#include +#include + +#include + +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/graphs/bytetrack/calculators/strack.h" +#include "mediapipe/graphs/bytetrack/calculators/kalman_filter.h" +#include "mediapipe/graphs/bytetrack/calculators/matching_utils.h" + + +// node { +// calculator: "ByteTrackCalculator" +// input_stream: "DETECTIONS:detections_with_id" +// output_stream: "DETECTIONS:tracked_detections" +// options: { +// [mediapipe.ByteTrackCalculatorOptions.ext] { +// track_high_threshold:0.5 +// track_low_threshold:0.1 +// new_track_threshold:0.25 +// matching_threshold: 0.75 +// track_buffer: 30 +// fuse_score: true +// 
} +// } +// } + +namespace mediapipe{ + REGISTER_CALCULATOR(ByteTrackCalculator); + + // DEBUG function + void printDists(std::vector> dists){ + LOG(INFO)<<"Inside dists debug function"; + for(auto i:dists){ + for(float j: i){ + LOG(INFO)< stracks){ + LOG(INFO)<<"stracks info"; + for(auto i:stracks){ + LOG(INFO)<<"State: "<(i.state())<<", Score: "<Inputs().Get("DETECTIONS",0).Set>(); + cc->Outputs().Get("DETECTIONS",0).Set>(); + return absl::OkStatus(); + } + + absl::Status ByteTrackCalculator::Open(CalculatorContext* cc){ + options_ = cc->Options(); + match_thresh_ = options_.matching_threshold(); + track_buffer_ = options_.track_buffer(); + track_high_thresh_ = options_.track_high_threshold(); + track_low_thresh_ = options_.track_low_threshold(); + new_track_thresh_ = options_.new_track_threshold(); + fuse_score_ = options_.fuse_score(); + return absl::OkStatus(); + } + + absl::Status ByteTrackCalculator::Process(CalculatorContext* cc){ + if(cc->Inputs().Get("DETECTIONS", 0).IsEmpty()){ + return absl::OkStatus(); // nothing to do + } + int64_t current_ts = cc->InputTimestamp().Microseconds(); + if (last_timestamp_ > 0 && frame_id_ > 1) { + float dt_sec = (current_ts - last_timestamp_) / 1e6f; + float instant_fps = 1.0f / dt_sec; + // Smooth it with a running average to avoid jitter + estimated_fps_ = 0.9f * estimated_fps_ + 0.1f * instant_fps; + LOG(INFO) << "Instant FPS : "<(estimated_fps_ / 30.0f * track_buffer_); + LOG(INFO) << "MAX TIME LOST: "<>(); + auto low_dets = std::make_unique>(); + + if (!cc->Inputs().Tag("DETECTIONS").IsEmpty()) { + const auto& input = cc->Inputs().Tag("DETECTIONS").Get>(); + for (const auto& detection : input) { + float score = detection.score(0); + if (score >= track_high_thresh_) high_dets->push_back(detection); + else if (score >= track_low_thresh_) low_dets->push_back(detection); + } + } + + std::vector activated_stracks; + std::vector refind_stracks; + std::vector lost_stracks; + std::vector removed_stracks; + + LOG(INFO) << 
"High dets: " << high_dets->size() + << " Low dets: " << low_dets->size(); + + // create tracks from high score detections + std::vector detections; + if( high_dets->size() > 0){ + for(auto& d:*high_dets){ + detections.emplace_back(d); + } + } + LOG(INFO)<<"Detections STrack list size: "< unconfirmed; + std::vector tracked_stracks; + for(auto& track:tracked_stracks_){ + if(!track.is_activated()){ + unconfirmed.push_back(&track); + }else{ + tracked_stracks.push_back(&track); + } + } + LOG(INFO) <<"TRACKED STRACK(tracked_stracks) SIZE: "< lost_ptrs; + for(auto& t : lost_stracks_) lost_ptrs.push_back(&t); + + auto strack_pool = JointStracks(tracked_stracks,lost_ptrs); + bytetrack::STrack::MultiPredict(strack_pool); + auto dists = BuildIoUCostMatrix(strack_pool,detections); + if(fuse_score_){ + dists = FuseScore(dists,detections); + } + // printDists(dists); + // gives 3 vectors + auto [matches,u_track,u_detection] = bytetrack::LinearAssignment(dists,match_thresh_); + + LOG(INFO)<<"First step association, " + <<" matches size: "<state() == bytetrack::BaseTrack::TrackState::TRACKED){ + track->Update(detections[idet],frame_id_); + activated_stracks.push_back(*track); + }else{ + track->ReActivate(det,frame_id_,false); + refind_stracks.push_back(*track); + } + } + LOG(INFO) << " After 1st assoc: activated=" << activated_stracks.size() + << " refind=" << refind_stracks.size(); + /////////////////////// Second association /////////////////////////// + std::vector detections_second; + if(low_dets->size()>0){ + for(const auto& d:*low_dets){ + detections_second.emplace_back(d); + } + } + LOG(INFO)<<"Detections second STrack list size: "< r_tracked_stracks; + for(int i:u_track){ + if(strack_pool[i]->state() == bytetrack::BaseTrack::TrackState::TRACKED){ + r_tracked_stracks.push_back(strack_pool[i]); + } + } + + dists = BuildIoUCostMatrix(r_tracked_stracks,detections_second); + if(fuse_score_){ + dists = FuseScore(dists,detections_second); + } + + auto 
[matches2,u_track2,u_detection_second] = bytetrack::LinearAssignment(dists,0.5f); + + LOG(INFO)<<"Second step association, " + <<" matches2 size: "<state() == bytetrack::BaseTrack::TrackState::TRACKED){ + // track->Update(det,frame_id_); + // activated_stracks.push_back(*track); + // }else{ + // track->ReActivate(det,frame_id_,false); + // refind_stracks.push_back(*track); + // } + // } + + for (int k = 0; k < matches2.rows(); ++k) { + int itracked = matches2(k, 0); + int idet = matches2(k, 1); + auto* track = r_tracked_stracks[itracked]; + auto det = detections_second[idet]; + if(track->state() == bytetrack::BaseTrack::TrackState::TRACKED){ + track->Update(det,frame_id_); + activated_stracks.push_back(*track); + }else{ + track->ReActivate(det,frame_id_,false); + refind_stracks.push_back(*track); + } + } + + // mark lost tracks + for(int it:u_track2){ + auto* track = r_tracked_stracks[it]; + if(track->state() != bytetrack::BaseTrack::TrackState::LOST){ + track->MarkLost(); + lost_stracks.push_back(*track); + } + } + LOG(INFO) << " After 2nd assoc: lost=" << lost_stracks.size(); + /////////////////////// DEAL W UNCONFIRMED TRACKS /////////////////////////// + std::vector detections_uc; + for(int i : u_detection){ + detections_uc.push_back(detections[i]); + } + + dists = BuildIoUCostMatrix(unconfirmed,detections_uc); + if(fuse_score_){ + dists = FuseScore(dists,detections_uc); + } + + auto [matches3,u_unconfirmed,u_detection_3] = bytetrack::LinearAssignment(dists,0.7f); + + for (int k = 0; k < matches3.rows(); ++k) { + int itracked = matches3(k, 0); + int idet = matches3(k, 1); + unconfirmed[itracked]->Update(detections_uc[idet],frame_id_); + activated_stracks.push_back(*unconfirmed[itracked]); + } + + for(int it: u_unconfirmed){ + auto *track = unconfirmed[it]; + track->MarkRemoved(); + } + /////////////////////// INITIALIZE NEW TRACKS ///////////////////////////// + for(int inew:u_detection_3){ + auto track = detections_uc[inew]; + if(track.score() < 
new_track_thresh_){ + continue; + } + track.Activate(&kalman_filter_,frame_id_); + activated_stracks.push_back(track); + } + LOG(INFO) << " After unconfirmed+new: activated=" << activated_stracks.size(); + // printTrackStates(activated_stracks); + /////////////////////// UPDATE STATE ///////////////////////////// + for(auto& track:lost_stracks_){ + LOG(INFO)<<"Time diff update state "<max_time_lost_){ + track.MarkRemoved(); + removed_stracks.push_back(track); + } + } + + LOG(INFO)<<"actiavted_stracks size: "< only_tracked; + for (auto& t : tracked_stracks_) { + if (t.state() == bytetrack::BaseTrack::TrackState::TRACKED) + only_tracked.push_back(t); + } + tracked_stracks_ = only_tracked; + } + + // joint_stracks(tracked_stracks_, activated_stracks) + { + std::vector cur_ptrs, act_ptrs, ref_ptrs; + for (auto& t : tracked_stracks_) cur_ptrs.push_back(&t); + for (auto& t : activated_stracks) act_ptrs.push_back(&t); + for (auto& t : refind_stracks) ref_ptrs.push_back(&t); + + auto joined = JointStracks(cur_ptrs, act_ptrs); + // joint_stracks(tracked_stracks_, refind_stracks) + joined = JointStracks(joined, ref_ptrs); + + std::vector joined_tracks; + joined_tracks.reserve(joined.size()); + for (auto* t : joined) joined_tracks.push_back(*t); + tracked_stracks_ = std::move(joined_tracks); + } + + // sub_stracks(lost_stracks_, tracked_stracks_) then extend with lost_stracks (local) + { + std::vector lost_ptrs2, new_tracked_ptrs, removed_ptrs, local_lost_ptrs; + for (auto& t : lost_stracks_) lost_ptrs2.push_back(&t); + for (auto& t : tracked_stracks_) new_tracked_ptrs.push_back(&t); // fresh pointers! 
+ for (auto& t : removed_stracks_) removed_ptrs.push_back(&t); + for (auto& t : lost_stracks) local_lost_ptrs.push_back(&t); + + auto new_lost = SubStracks(lost_ptrs2, new_tracked_ptrs); + new_lost = JointStracks(new_lost, local_lost_ptrs); // extend + new_lost = SubStracks(new_lost, removed_ptrs); + + // lost_stracks_.clear(); + // for (auto* t : new_lost) lost_stracks_.push_back(*t); + std::vector l_stracks; + l_stracks.reserve(new_lost.size()); + for (auto* t : new_lost) l_stracks.push_back(*t); + lost_stracks_ = std::move(l_stracks); + } + + // extend removed_stracks_ + for (auto& t : removed_stracks) removed_stracks_.push_back(t); + + // remove duplicates + { + std::vector tracked_ptrs2, lost_ptrs3; + for (auto& t : tracked_stracks_) tracked_ptrs2.push_back(&t); + for (auto& t : lost_stracks_) lost_ptrs3.push_back(&t); + auto [dedup_tracked, dedup_lost] = RemoveDuplicateStracks(tracked_ptrs2, lost_ptrs3); + std::vector new_tracked, new_lost; + new_tracked.reserve(dedup_tracked.size()); + new_lost.reserve(dedup_lost.size()); + for (auto* t : dedup_tracked) new_tracked.push_back(*t); + for (auto* t : dedup_lost) new_lost.push_back(*t); + tracked_stracks_ = std::move(new_tracked); + lost_stracks_ = std::move(new_lost); + } + + LOG(INFO) << "After update state"; + LOG(INFO) << " End of frame: tracked=" << tracked_stracks_.size() + << " lost=" << lost_stracks_.size(); + + + ////////////////////////////// OUTPUT /////////////////////////////////// + auto output = std::make_unique>(); + for(const auto& t : tracked_stracks_){ + if(!t.is_activated()) continue; + LOG(INFO) << "Track ID: " << t.track_id() << " is_activated: " << t.is_activated(); + Detection d; + d.set_detection_id(t.track_id()); + d.add_label(t.label()); + d.add_score(t.score()); + auto* loc = d.mutable_location_data(); + loc->set_format(LocationData::RELATIVE_BOUNDING_BOX); + auto* rb = loc->mutable_relative_bounding_box(); + Eigen::Vector4f box = t.tlwh(); + rb->set_xmin(box(0)); + 
rb->set_ymin(box(1)); + rb->set_width(box(2)); + rb->set_height(box(3)); + output->push_back(d); + } + LOG(INFO) << "After building detections"; + LOG(INFO) << " Output size: " << output->size(); + // cc->Outputs().Get("DETECTIONS",0).Add( + // output.release(), cc->InputTimestamp()); + cc->Outputs().Get("DETECTIONS", 0).Add( + output.release(), cc->Inputs().Get("DETECTIONS", 0).Value().Timestamp()); + return absl::OkStatus(); + + } + + std::vector ByteTrackCalculator::JointStracks(std::vector& a,std::vector& b){ + std::unordered_map exists; + std::vector res; + + for(auto *t:a){ + exists[t->track_id()] = true; + res.push_back(t); + } + + for(auto *t:b){ + int tid = t->track_id(); + if (exists.find(tid) == exists.end()){ + exists[tid] = true; + res.push_back(t); + } + } + return res; + } + + std::vector ByteTrackCalculator::SubStracks(std::vector& a,std::vector& b){ + std::unordered_map exists; + std::vector res; + for(auto *t:a){ + exists[t->track_id()] = t; + } + for(auto *t:b){ + int tid = t->track_id(); + if (exists.find(tid) != exists.end()){ + exists.erase(tid); + } + } + for(auto& i:exists){ + res.push_back(i.second); + } + return res; + } + + //// WIP + std::pair, std::vector> + ByteTrackCalculator::RemoveDuplicateStracks(std::vector& a,std::vector& b){ + + auto pdist = BuildIoUCostMatrix(a, b); + + std::vector dupa, dupb; + for (int p = 0; p < (int)a.size(); ++p) { + for (int q = 0; q < (int)b.size(); ++q) { + if (pdist(p, q) < 0.15f) { // high overlap — duplicate + int timep = a[p]->frame_id() - a[p]->start_frame(); + int timeq = b[q]->frame_id() - b[q]->start_frame(); + if (timep > timeq) + dupb.push_back(q); + else + dupa.push_back(p); + } + } + } + + std::vector resa, resb; + for (int i = 0; i < (int)a.size(); ++i) + if (std::find(dupa.begin(), dupa.end(), i) == dupa.end()) + resa.push_back(a[i]); + for (int i = 0; i < (int)b.size(); ++i) + if (std::find(dupb.begin(), dupb.end(), i) == dupb.end()) + resb.push_back(b[i]); + + return {resa, resb}; + + } 
+ + + +} diff --git a/mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.h b/mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.h new file mode 100644 index 00000000000..8c0cee380ad --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.h @@ -0,0 +1,53 @@ +#ifndef MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_BYTETRACK_CALCULATOR_H_ +#define MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_BYTETRACK_CALCULATOR_H_ + +#include "mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.pb.h" + +#include "mediapipe/framework/calculator_framework.h" +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/graphs/bytetrack/calculators/strack.h" +#include "mediapipe/graphs/bytetrack/calculators/kalman_filter.h" + +namespace mediapipe{ + class ByteTrackCalculator : public CalculatorBase{ + public: + static absl::Status GetContract(CalculatorContract* cc); + absl::Status Open(CalculatorContext* cc) override; + absl::Status Process(CalculatorContext* cc) override; + private: + ::mediapipe::ByteTrackCalculatorOptions options_; + std::vector tracked_stracks_; + std::vector lost_stracks_; + std::vector removed_stracks_; + + int track_buffer_; + float det_thresh_; + float match_thresh_; + float track_high_thresh_; + float track_low_thresh_; + float new_track_thresh_; + bool fuse_score_; + int frame_id_ = 0; + int max_time_lost_ = 30; + int64_t last_timestamp_ = -1; + float estimated_fps_ = 30.0f; + + bytetrack::KalmanFilter kalman_filter_; + + static std::vector JointStracks( + std::vector& a, + std::vector& b); + + static std::vector SubStracks( + std::vector& a, + std::vector& b); + + static std::pair, std::vector> + RemoveDuplicateStracks( + std::vector& a, + std::vector& b); + + }; +} + +#endif \ No newline at end of file diff --git a/mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.proto b/mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.proto new file mode 100644 index 00000000000..7d91b63caf2 --- 
/dev/null +++ b/mediapipe/graphs/bytetrack/calculators/bytetrack_calculator.proto @@ -0,0 +1,17 @@ +syntax = "proto2"; + +package mediapipe; + +import "mediapipe/framework/calculator.proto"; + +message ByteTrackCalculatorOptions { + extend mediapipe.CalculatorOptions { + optional ByteTrackCalculatorOptions ext = 247258241; + } + optional float matching_threshold = 2 [default = 0.8]; + optional int32 track_buffer = 3 [default = 30]; + optional float track_high_threshold = 4 [default = 0.25]; + optional float track_low_threshold = 5 [default = 0.1]; + optional float new_track_threshold = 6 [default = 0.25]; + optional bool fuse_score = 7 [default = true]; +} diff --git a/mediapipe/graphs/bytetrack/calculators/kalman_filter.cc b/mediapipe/graphs/bytetrack/calculators/kalman_filter.cc new file mode 100644 index 00000000000..c93468f2315 --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/kalman_filter.cc @@ -0,0 +1,106 @@ +#include "mediapipe/graphs/bytetrack/calculators/kalman_matrices.h" + +#include + +#include + +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/graphs/bytetrack/calculators/kalman_filter.h" + + +namespace mediapipe{ + namespace bytetrack{ + + KalmanFilter::KalmanFilter(): motion_mat_(MakeMotionMatrix()), + update_mat_(MakeUpdateMatrix()), + kStdWeightPos_(kStdWeightPos), + kStdWeightVel_(kStdWeightVel) {} + + std::pair KalmanFilter::Initiate(const Eigen::Vector4f detection){ + Mean mean; + mean << detection(0), detection(1), detection(2), detection(3), 0.f, 0.f, 0.f, 0.f; + + Eigen::Matrix std_vals; + std_vals << 2.f * kStdWeightPos_ * detection(3), + 2.f * kStdWeightPos_ * detection(3), + 1e-2f, + 2.f * kStdWeightPos_ * detection(3), + 10.f * kStdWeightVel_ * detection(3), + 10.f * kStdWeightVel_ * detection(3), + 1e-5f, + 10.f * kStdWeightVel_ * detection(3); + + Cov cov = std_vals.array().square().matrix().asDiagonal(); + return {mean, cov}; + } + + std::pair KalmanFilter::Predict(const Mean& mean, const Cov& cov) 
const { + const float h = mean(3); + Eigen::Matrix std_pv; + std_pv << kStdWeightPos_ * h, kStdWeightPos_ * h, 1e-2f, kStdWeightPos_ * h, + kStdWeightVel_ * h, kStdWeightVel_ * h, 1e-5f, kStdWeightVel_ * h; + const Cov motion_cov = std_pv.array().square().matrix().asDiagonal(); + const Mean est_mean = mean * motion_mat_.transpose(); + const Cov est_cov = motion_mat_ * cov * motion_mat_.transpose() + motion_cov; + + return {est_mean, est_cov}; + } + + std::pair KalmanFilter::Update(const Mean& mean, const Cov& cov,const Eigen::Vector4f xyah) const{ + const float x_c = xyah(0); + const float y_c = xyah(1); + const float ar = xyah(2); + const float h = xyah(3); + + Eigen::Matrix measurement; + measurement << x_c, y_c, ar, h; + // Measurement-noise covariance R (diagonal), scaled by the state height mean(3) + Eigen::Matrix noise_std; + noise_std << kStdWeightPos_ * mean(3), + kStdWeightPos_ * mean(3), + 1e-1f, + kStdWeightPos_ * mean(3); + const Eigen::Matrix innov_cov = noise_std.array().square().matrix().asDiagonal(); + + const Eigen::Matrix proj_mean = mean * update_mat_.transpose(); + const Eigen::Matrix proj_cov = update_mat_ * cov * update_mat_.transpose() + innov_cov; + + // Kalman gain via Cholesky solve: K = (P H^T) (H P H^T + R)^{-1} + const Eigen::Matrix PHt = cov * update_mat_.transpose(); + const Eigen::Matrix K = proj_cov.llt().solve(PHt.transpose()).transpose(); + + const Mean updated_mean = mean + (measurement - proj_mean) * K.transpose(); + const Cov updated_cov = cov - K * proj_cov * K.transpose(); + + return {updated_mean, updated_cov}; + + } + + std::pair KalmanFilter::MultiPredict(const MeanMatrix& means, const CovMatrix& covs) const { + const int N = means.rows(); + + // Build Nx8 std matrix — each row is the std devs for one track + MeanMatrix std_mat(N, 8); + for (int i = 0; i < N; ++i) { + const float h = means(i, 3); + std_mat.row(i) << kStdWeightPos_ * h, kStdWeightPos_ * h, 1e-2f, kStdWeightPos_ * h, + kStdWeightVel_ * h, kStdWeightVel_ * h, 1e-5f, kStdWeightVel_ * h; + } + + 
// Predicted means: (N,8) @ F^T + MeanMatrix pred_means = means * motion_mat_.transpose(); + + // Predicted covariances per track + CovMatrix pred_covs(N); + for (int i = 0; i < N; ++i) { + const Cov motion_cov = std_mat.row(i) + .array().square() + .matrix().asDiagonal(); + pred_covs[i] = motion_mat_ * covs[i] * motion_mat_.transpose()+ motion_cov; + } + + return {pred_means, pred_covs}; + } + + } +} diff --git a/mediapipe/graphs/bytetrack/calculators/kalman_filter.h b/mediapipe/graphs/bytetrack/calculators/kalman_filter.h new file mode 100644 index 00000000000..e0e6345bf35 --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/kalman_filter.h @@ -0,0 +1,33 @@ +#ifndef MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_KALMAN_FILTER_H_ +#define MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_KALMAN_FILTER_H_ +#include + +#include + +#include "mediapipe/framework/formats/detection.pb.h" + +namespace mediapipe { +namespace bytetrack { + +using Mean = Eigen::Matrix; +using Cov = Eigen::Matrix; +using MeanMatrix = Eigen::Matrix; +using CovMatrix = std::vector; +class KalmanFilter { +public: + KalmanFilter(); + std::pair Initiate(const Eigen::Vector4f detection); + std::pair Predict(const Mean& mean, const Cov& cov) const; + std::pair Update(const Mean& mean, const Cov& cov,const Eigen::Vector4f xyah) const; + std::pair MultiPredict(const MeanMatrix& means, const CovMatrix& covs) const; +private: + Eigen::Matrix motion_mat_; + Eigen::Matrix update_mat_; + float kStdWeightPos_; + float kStdWeightVel_; +}; + +} // namespace bytetrack +} // namespace mediapipe + +#endif \ No newline at end of file diff --git a/mediapipe/graphs/bytetrack/calculators/kalman_matrices.h b/mediapipe/graphs/bytetrack/calculators/kalman_matrices.h new file mode 100644 index 00000000000..428e896bdcd --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/kalman_matrices.h @@ -0,0 +1,28 @@ +// kalman_matrices.h +#ifndef MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_KALMAN_MATRICES_H_ +#define 
MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_KALMAN_MATRICES_H_ + +#include + +namespace mediapipe { +namespace bytetrack { + +static constexpr float kStdWeightPos = 1.0f / 20.0f; +static constexpr float kStdWeightVel = 1.0f / 160.0f; + +inline Eigen::Matrix MakeMotionMatrix() { + Eigen::Matrix F = Eigen::Matrix::Identity(); + F.block<4, 4>(0, 4) = Eigen::Matrix4f::Identity(); + return F; +} + +inline Eigen::Matrix MakeUpdateMatrix() { + Eigen::Matrix H = Eigen::Matrix::Zero(); + H.block<4, 4>(0, 0) = Eigen::Matrix4f::Identity(); + return H; +} + +} // namespace bytetrack +} // namespace mediapipe + +#endif \ No newline at end of file diff --git a/mediapipe/graphs/bytetrack/calculators/matching_utils.h b/mediapipe/graphs/bytetrack/calculators/matching_utils.h new file mode 100644 index 00000000000..8d222e0f650 --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/matching_utils.h @@ -0,0 +1,312 @@ +#ifndef MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_MATCHING_UTILS_H_ +#define MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_MATCHING_UTILS_H_ + +#include +#include +#include + +#include + +#include "mediapipe/graphs/bytetrack/calculators/strack.h" + +namespace mediapipe { +namespace bytetrack { + + +struct AssignmentResult { + Eigen::MatrixXi matches; // shape (K, 2) — col 0 = track idx, col 1 = box idx + Eigen::VectorXi unmatched_tracks; // shape (P,) + Eigen::VectorXi unmatched_boxes; // shape (Q,) +}; + +// IoU between two [top, left, bottom, right] boxes +inline float ComputeIoU(const Eigen::Vector4f& a, + const Eigen::Vector4f& b) { + float inter_x1 = std::max(a[0], b[0]); // left + float inter_y1 = std::max(a[1], b[1]); // top + float inter_x2 = std::min(a[2], b[2]); // right + float inter_y2 = std::min(a[3], b[3]); // bottom + + float inter_w = std::max(0.f, inter_x2 - inter_x1); + float inter_h = std::max(0.f, inter_y2 - inter_y1); + float inter_area = inter_w * inter_h; + if (inter_area == 0.f) return 0.f; + + float area_a = (a[2]-a[0]) * (a[3]-a[1]); // w * h + float 
area_b = (b[2]-b[0]) * (b[3]-b[1]); + return inter_area / (area_a + area_b - inter_area); +} + +inline Eigen::MatrixXf BuildIoUCostMatrix( + const std::vector& tracks, + const std::vector& detections) +{ + int N = tracks.size(); + int M = detections.size(); + + Eigen::MatrixXf cost(N, M); + + for (int i = 0; i < N; ++i) { + auto tb = tracks[i]->tlbr(); + for (int j = 0; j < M; ++j) { + auto bb = detections[j].tlbr(); + cost(i, j) = 1.f - ComputeIoU(tb, bb); + } + } + + return cost; +} + +inline Eigen::MatrixXf FuseScore( + const Eigen::MatrixXf cost_matrix, + const std::vector& detections) +{ + if(cost_matrix.size() == 0){ + return cost_matrix; + } + Eigen::MatrixXf iou_sim = 1.0f - cost_matrix.array(); + + Eigen::RowVectorXf det_scores(detections.size()); + for(int i = 0; i < (int)detections.size(); i++){ + det_scores(i) = detections[i].score(); + } + + Eigen::MatrixXf det_scores_mat = det_scores.replicate(cost_matrix.rows(), 1); + Eigen::MatrixXf fuse_sim = iou_sim.array() * det_scores_mat.array(); + Eigen::MatrixXf fuse_cost = 1.0f - fuse_sim.array(); + + return fuse_cost; + +} + +inline Eigen::MatrixXf BuildIoUCostMatrix( + const std::vector& a, + const std::vector& b) +{ + int N = a.size(); + int M = b.size(); + + Eigen::MatrixXf cost(N, M); + + for (int i = 0; i < N; ++i) { + auto ta = a[i]->tlbr(); + for (int j = 0; j < M; ++j) { + auto tb = b[j]->tlbr(); + cost(i, j) = 1.f - ComputeIoU(ta, tb); + } + } + + return cost; +} + + +// inline AssignmentResult LinearAssignment(const Eigen::MatrixXf& cost, float thresh){ +// int N = (int)cost.rows(); +// int M = (int)cost.cols(); + +// // Collect candidates below threshold +// std::vector> entries; +// entries.reserve(N * M); +// for (int i = 0; i < N; ++i) +// for (int j = 0; j < M; ++j) +// if (cost(i,j) <= thresh) +// entries.emplace_back(cost(i,j), i, j); + +// std::sort(entries.begin(), entries.end()); + +// // Greedy assignment +// std::vector track_used(N, false); +// std::vector box_used(M, false); +// 
std::vector> matched; +// matched.reserve(std::min(N, M)); + +// for (auto& [c, i, j] : entries) { +// if (!track_used[i] && !box_used[j]) { +// matched.emplace_back(i, j); +// track_used[i] = true; +// box_used[j] = true; +// } +// } + +// // Pack into Eigen outputs +// int K = (int)matched.size(); +// Eigen::MatrixXi matches(K, 2); // (K,2) — mirrors np.empty((0,2)) when K=0 +// for (int k = 0; k < K; ++k) { +// matches(k, 0) = matched[k].first; +// matches(k, 1) = matched[k].second; +// } + +// // Count unmatched first, then fill — avoids push_back on Eigen vectors +// int n_ut = (int)std::count(track_used.begin(), track_used.end(), false); +// int n_ub = (int)std::count(box_used.begin(), box_used.end(), false); + +// Eigen::VectorXi unmatched_tracks(n_ut); +// Eigen::VectorXi unmatched_boxes(n_ub); + +// for (int i = 0, k = 0; i < N; ++i) +// if (!track_used[i]) unmatched_tracks(k++) = i; +// for (int j = 0, k = 0; j < M; ++j) +// if (!box_used[j]) unmatched_boxes(k++) = j; + +// return {matches, unmatched_tracks, unmatched_boxes}; +// } + +inline AssignmentResult LinearAssignment(const Eigen::MatrixXf& cost, float thresh){ + int N = (int)cost.rows(); + int M = (int)cost.cols(); + + // Empty matrix early exit — mirrors Python: if cost_matrix.size == 0 + if (N == 0 || M == 0) { + Eigen::MatrixXi matches(0, 2); + Eigen::VectorXi u_tracks(N), u_boxes(M); + for (int i = 0; i < N; ++i) u_tracks(i) = i; + for (int j = 0; j < M; ++j) u_boxes(j) = j; + return {matches, u_tracks, u_boxes}; + } + + // Pad to square S x S — mirrors lap.lapjv extend_cost=True + int S = std::max(N, M); + const float INF = 1e9f; + + Eigen::MatrixXf cost_sq = Eigen::MatrixXf::Constant(S, S, INF); + for (int i = 0; i < N; ++i) + for (int j = 0; j < M; ++j) + cost_sq(i, j) = cost(i, j); + + // Dual variables and assignment vectors + std::vector u(S, 0.f), v(S, 0.f); + std::vector row2col(S, -1), col2row(S, -1); + + // Phase 1: Column reduction — init v[j] to column minimum + for (int j = 0; j < 
S; ++j) { + int best_i = 0; + float best_v = cost_sq(0, j); + for (int i = 1; i < S; ++i) { + if (cost_sq(i, j) < best_v) { best_v = cost_sq(i, j); best_i = i; } + } + v[j] = best_v; + if (row2col[best_i] == -1) { + row2col[best_i] = j; + col2row[j] = best_i; + } + } + + // Phase 2: Augmenting row reduction (2 passes) + for (int pass = 0; pass < 2; ++pass) { + for (int i = 0; i < S; ++i) { + if (row2col[i] != -1) continue; + int j1 = -1, j2 = -1; + float u1 = INF, u2 = INF; + for (int j = 0; j < S; ++j) { + float h = cost_sq(i, j) - v[j]; + if (h < u2) { + if (h < u1) { u2 = u1; j2 = j1; u1 = h; j1 = j; } + else { u2 = h; j2 = j; } + } + } + u[i] = u1; + if (col2row[j1] == -1) { + row2col[i] = j1; + col2row[j1] = i; + } else { + v[j1] -= (u2 - u1); + } + } + } + + // Phase 3: Augmentation via shortest path (Dijkstra with potentials) + std::vector dist(S); + std::vector pred(S, -1); + std::vector visited(S, false); + + for (int i_start = 0; i_start < S; ++i_start) { + if (row2col[i_start] != -1) continue; + + std::fill(dist.begin(), dist.end(), INF); + std::fill(pred.begin(), pred.end(), -1); + std::fill(visited.begin(), visited.end(), false); + + for (int j = 0; j < S; ++j) + dist[j] = cost_sq(i_start, j) - u[i_start] - v[j]; + + int j_end = -1; + float d_min = INF; + + for (int iter = 0; iter < S; ++iter) { + // Pick unvisited col with smallest dist + int j_min = -1; + d_min = INF; + for (int j = 0; j < S; ++j) + if (!visited[j] && dist[j] < d_min) { d_min = dist[j]; j_min = j; } + + if (j_min == -1) break; + visited[j_min] = true; + + if (col2row[j_min] == -1) { j_end = j_min; break; } + + // Relax edges through the row that owns j_min + int i_next = col2row[j_min]; + u[i_next] = cost_sq(i_next, j_min) - v[j_min] - d_min; // update dual + for (int j = 0; j < S; ++j) { + if (visited[j]) continue; + float nd = d_min + cost_sq(i_next, j) - u[i_next] - v[j]; + if (nd < dist[j]) { dist[j] = nd; pred[j] = j_min; } + } + } + + // Update col duals along the path + for 
(int j = 0; j < S; ++j) + if (visited[j]) v[j] += dist[j] - d_min; + u[i_start] += d_min; + + // Augment: flip assignments along path back to i_start + int j_cur = j_end; + while (j_cur != -1) { + int i_cur = (pred[j_cur] == -1) ? i_start : col2row[pred[j_cur]]; + col2row[j_cur] = i_cur; + row2col[i_cur] = j_cur; + j_cur = pred[j_cur]; + } + } + + // Extract matches — mirrors: for ix, mx in enumerate(x): if mx >= 0 + // Apply cost_limit=thresh filter here + std::vector track_used(N, false); + std::vector box_used(M, false); + std::vector> matched; + + for (int i = 0; i < N; ++i) { + int j = row2col[i]; + if (j < M && cost(i, j) <= thresh) { + matched.emplace_back(i, j); + track_used[i] = true; + box_used[j] = true; + } + } + + // Pack into AssignmentResult + int K = (int)matched.size(); + Eigen::MatrixXi matches(K, 2); + for (int k = 0; k < K; ++k) { + matches(k, 0) = matched[k].first; + matches(k, 1) = matched[k].second; + } + + int n_ut = (int)std::count(track_used.begin(), track_used.end(), false); + int n_ub = (int)std::count(box_used.begin(), box_used.end(), false); + + Eigen::VectorXi unmatched_tracks(n_ut); + Eigen::VectorXi unmatched_boxes(n_ub); + + for (int i = 0, k = 0; i < N; ++i) + if (!track_used[i]) unmatched_tracks(k++) = i; + for (int j = 0, k = 0; j < M; ++j) + if (!box_used[j]) unmatched_boxes(k++) = j; + + return {matches, unmatched_tracks, unmatched_boxes}; +} + +} // namespace bytetrack +} // namespace mediapipe + +#endif \ No newline at end of file diff --git a/mediapipe/graphs/bytetrack/calculators/render_data_passthrough_calculator.cc b/mediapipe/graphs/bytetrack/calculators/render_data_passthrough_calculator.cc new file mode 100644 index 00000000000..bf03ac23c40 --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/render_data_passthrough_calculator.cc @@ -0,0 +1,33 @@ +#include "mediapipe/framework/calculator_framework.h" + +#include "mediapipe/util/render_data.pb.h" + +namespace mediapipe { + +class PassThroughRenderDataCalculator : 
public CalculatorBase { + public: + static absl::Status GetContract(CalculatorContract* cc) { + cc->Inputs().Tag("RENDER_DATA").Set(); + cc->Outputs().Tag("RENDER_DATA").Set(); + return absl::OkStatus(); + } + + absl::Status Process(CalculatorContext* cc) override { + const auto& render_data = + cc->Inputs().Tag("RENDER_DATA").Get(); + + LOG(INFO) << "RenderData: num objects = " + << render_data.render_annotations_size(); + + // Forward unchanged + cc->Outputs().Tag("RENDER_DATA").AddPacket( + MakePacket(render_data) + .At(cc->InputTimestamp())); + + return absl::OkStatus(); + } +}; + +REGISTER_CALCULATOR(PassThroughRenderDataCalculator); + +} // namespace mediapipe \ No newline at end of file diff --git a/mediapipe/graphs/bytetrack/calculators/strack.cc b/mediapipe/graphs/bytetrack/calculators/strack.cc new file mode 100644 index 00000000000..7f22675245c --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/strack.cc @@ -0,0 +1,148 @@ +#include "mediapipe/graphs/bytetrack/calculators/basetrack.h" + +#include + +#include + +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/graphs/bytetrack/calculators/kalman_filter.h" +#include "mediapipe/graphs/bytetrack/calculators/kalman_matrices.h" +#include "mediapipe/graphs/bytetrack/calculators/strack.h" + +namespace mediapipe{ +namespace bytetrack{ + + KalmanFilter STrack::shared_kalman; + + STrack::STrack(const Detection& det){ + const auto &loc = det.location_data(); + score_ = det.score_size() > 0 ? det.score(0) : 0.0f; + label_ = det.label_size() > 0 ? 
det.label(0) : ""; + if(loc.format() == LocationData::RELATIVE_BOUNDING_BOX){ + const auto& rb = loc.relative_bounding_box(); + tlwh_ << rb.xmin(), rb.ymin(), rb.width(), rb.height(); + }else if(loc.format() == LocationData::BOUNDING_BOX){ + const auto& b = loc.bounding_box(); + tlwh_ << b.xmin(), b.ymin(), b.width(), b.height(); + }else{ + tlwh_.setZero(); + } + } + + void STrack::Predict(){ + Mean mean_state = mean(); + if(state() != TrackState::TRACKED){ + mean_state(7) = 0.f; + } + auto [new_mean,new_cov] = kf_->Predict(mean_state,cov()); + mean_ = new_mean; + cov_ = new_cov; + } + + void STrack::Activate(KalmanFilter* kalman_filter,int frame_id){ + kf_ = kalman_filter; + track_id_ = next_id(); + auto [new_mean, new_cov] = kf_->Initiate(TlwhToXyah(tlwh_)); + mean_ = new_mean; + cov_ = new_cov; + + tracklet_len_ = 0; + state_ = TrackState::TRACKED; + if(frame_id == 1) is_activated_ = true; + frame_id_ = frame_id; + start_frame_ = frame_id; + } + + void STrack::ReActivate(const STrack& new_track, int frame_id, bool new_id){ + Eigen::Vector4f new_tlwh = new_track.tlwh_; + auto [new_mean,new_cov] = kf_->Update(mean_,cov_,TlwhToXyah(new_tlwh)); + mean_ = new_mean; + cov_ = new_cov; + tracklet_len_ = 0; + state_ = TrackState::TRACKED; + is_activated_ = true; + + frame_id_ = frame_id; + if(new_id) track_id_ = next_id(); + score_ = new_track.score(); + } + + void STrack::Update(const STrack& new_track, int frame_id){ + frame_id_ = frame_id; + tracklet_len_+=1; + + Eigen::Vector4f new_tlwh = new_track.tlwh_; + auto [new_mean,new_cov] = kf_->Update(mean_,cov_,TlwhToXyah(new_tlwh)); + mean_ = new_mean; + cov_ = new_cov; + + state_ = TrackState::TRACKED; + is_activated_ = true; + score_ = new_track.score(); + } + + void STrack::MultiPredict(std::vector& tracks){ + long int n = tracks.size(); + if(n>0){ + MeanMatrix multi_mean(n,8); + for(int i=0;imean_; + } + CovMatrix multi_cov(n); + for(int i=0;icov_; + } + for(int i=0;istate_ != TrackState::TRACKED){ + multi_mean(i,7) = 
0.f; + } + } + auto [updated_means, updated_covs] = shared_kalman.MultiPredict(multi_mean, multi_cov); + for(int i=0;imean_ = updated_means.row(i); + tracks[i]->cov_ = updated_covs[i]; + } + } + } + + + Eigen::Vector4f STrack::TlwhToXyah(const Eigen::Vector4f& tlwh){ + Eigen::Vector4f xyah; + float x = tlwh(0); + float y = tlwh(1); + float w = tlwh(2); + float h = tlwh(3); + + xyah(0) = x + w / 2.0f; + xyah(1) = y + h / 2.0f; + xyah(2) = w / h; + xyah(3) = h; + + return xyah; + } + + Eigen::Vector4f STrack::tlwh() const { + // Before activation — return the raw detection box + if (kf_ == nullptr) return tlwh_; + + // After activation — reconstruct from Kalman mean + // mean_ = [cx, cy, ar, h, vx, vy, var, vh] + Eigen::Vector4f ret; + ret(0) = mean_(0); + ret(1) = mean_(1); + ret(2) = mean_(2) * mean_(3); + ret(3) = mean_(3); + ret(0) -= ret(2) / 2.f; + ret(1) -= ret(3) / 2.f; + return ret; + } + + Eigen::Vector4f STrack::tlbr() const { + Eigen::Vector4f ret = tlwh(); + ret(2) += ret(0); + ret(3) += ret(1); + return ret; + } + +} +} \ No newline at end of file diff --git a/mediapipe/graphs/bytetrack/calculators/strack.h b/mediapipe/graphs/bytetrack/calculators/strack.h new file mode 100644 index 00000000000..883742d24c0 --- /dev/null +++ b/mediapipe/graphs/bytetrack/calculators/strack.h @@ -0,0 +1,60 @@ +#ifndef MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_STRACK_H_ +#define MEDIAPIPE_GRAPHS_BYTETRACK_CALCULATORS_STRACK_H_ + +#include +#include + +#include + +#include "mediapipe/framework/formats/detection.pb.h" +#include "mediapipe/graphs/bytetrack/calculators/kalman_filter.h" +#include "mediapipe/graphs/bytetrack/calculators/basetrack.h" + +namespace mediapipe { +namespace bytetrack { + +using Mean = Eigen::Matrix; +using Cov = Eigen::Matrix; +using MeanMatrix = Eigen::Matrix; +using CovMatrix = std::vector; + +class STrack : public BaseTrack { + public: + static KalmanFilter shared_kalman; + + explicit STrack(const Detection& det); + + void Activate(KalmanFilter* 
kalman_filter, int frame_id); + void ReActivate(const STrack& new_track, int frame_id, bool new_id = false); + void Update(const STrack& new_track, int frame_id); + + void Predict(); + static void MultiPredict(std::vector& tracks); + + Eigen::Vector4f tlwh() const; + Eigen::Vector4f tlbr() const; + // static Eigen::Vector4f TlbrToTlwh(const Eigen::Vector4f& tlbr); + // static Eigen::Vector4f TlwhToTlbr(const Eigen::Vector4f& tlwh); + static Eigen::Vector4f TlwhToXyah(const Eigen::Vector4f& tlwh); + + // KalmanState ToProto() const; + int tracklet_len() const { return tracklet_len_; } + std::string label() const {return label_;} + const Mean& mean() const { return mean_; } + const Cov& cov() const { return cov_; } + + private: + // STrack-only — Kalman state and detection origin + // score_, is_activated_, tracklet_len_ are inherited from BaseTrack — do NOT redeclare + Eigen::Vector4f tlwh_; + std::string label_; + KalmanFilter* kf_ = nullptr; + Mean mean_ = Mean::Zero(); + Cov cov_ = Cov::Zero(); + int tracklet_len_ = 0; +}; + +} // namespace bytetrack +} // namespace mediapipe + +#endif \ No newline at end of file diff --git a/mediapipe/models/BUILD b/mediapipe/models/BUILD index 53a7c391695..fb0d455b792 100644 --- a/mediapipe/models/BUILD +++ b/mediapipe/models/BUILD @@ -21,6 +21,18 @@ licenses(["notice"]) package(default_visibility = ["//visibility:public"]) +filegroup( + name = "yolox_tiny_float32", + srcs = ["yolox_tiny_float32.tflite"], + visibility = ["//visibility:public"], +) + +filegroup( + name = "coco_labels", + srcs = ["coco_labels.txt"], + visibility = ["//visibility:public"], +) + mediapipe_files( srcs = [ "README.md", @@ -40,4 +52,3 @@ mediapipe_files( "ssdlite_object_detection_labelmap.txt", ], ) -