perf: speed up tensor_to_sd_image conversion

2026-05-01 11:11:30 +02:00 · 2026-04-30 01:03:35 +08:00 · 2026-04-30 01:03:35 +08:00 · b2a30b9a0e
commit b2a30b9a0e
parent b8079e253d
2 changed files with 71 additions and 25 deletions
--- a/src/preprocessing.hpp
+++ b/src/preprocessing.hpp
@ -24,6 +24,75 @@ static inline void preprocessing_set_4d(sd::Tensor<float>& tensor, float value,
    tensor.values()[static_cast<size_t>(preprocessing_offset_4d(tensor, i0, i1, i2, i3))] = value;
 }

+// Maps a float sample to a byte: values at or below 0 (and NaN) give 0, values
+// at or above 1 give 255, and anything in between is scaled by 255 and rounded
+// to nearest by adding 0.5 before the truncating cast.
+static inline uint8_t preprocessing_float_to_u8(float value) {
+    if (!(value > 0.0f)) {  // negated test is also true for NaN; casting NaN would be UB
+        return 0;
+    }
+    return value >= 1.0f ? 255 : static_cast<uint8_t>(value * 255.0f + 0.5f);
+}
+
+// Converts one frame of a planar float tensor into interleaved 8-bit samples.
+//
+// The tensor must be 4-D or 5-D. shape[0] and shape[1] are read as width and
+// height. The channel count is shape[2] for 4-D input and shape[3] for 5-D
+// input; frame_index must lie in [0, shape[3]) for 4-D input and in
+// [0, shape[2]) for 5-D input. For 5-D input only the data at offset 0 along
+// the last axis is read.
+// image_data must be non-null. It receives width * height * channel bytes,
+// with the channels of each pixel stored next to each other; every sample
+// goes through preprocessing_float_to_u8.
+static inline void preprocessing_tensor_frame_to_sd_image(const sd::Tensor<float>& tensor, int frame_index, uint8_t* image_data) {
+    const auto& shape = tensor.shape();
+    GGML_ASSERT(shape.size() == 4 || shape.size() == 5);
+    GGML_ASSERT(image_data != nullptr);
+
+    const bool is_4d        = shape.size() == 4;
+    const int cols          = static_cast<int>(shape[0]);
+    const int rows          = static_cast<int>(shape[1]);
+    const int n_channels    = static_cast<int>(shape[is_4d ? 2 : 3]);
+    const size_t plane_size = static_cast<size_t>(cols) * static_cast<size_t>(rows);
+    const float* base       = tensor.data();
+
+    // plane_stride: distance between consecutive channel planes of the frame.
+    // first_plane:  start of channel 0 of the requested frame.
+    size_t plane_stride      = 0;
+    const float* first_plane = nullptr;
+    if (is_4d) {
+        GGML_ASSERT(frame_index >= 0 && frame_index < shape[3]);
+        const size_t frame_size = plane_size * static_cast<size_t>(n_channels);
+        plane_stride            = plane_size;
+        first_plane             = base + static_cast<size_t>(frame_index) * frame_size;
+    } else {
+        GGML_ASSERT(frame_index >= 0 && frame_index < shape[2]);
+        plane_stride = plane_size * static_cast<size_t>(shape[2]);
+        first_plane  = base + static_cast<size_t>(frame_index) * plane_size;
+    }
+
+    uint8_t* out = image_data;
+    if (n_channels == 3) {
+        // Three-channel case: read the three planes side by side.
+        const float* plane0 = first_plane;
+        const float* plane1 = plane0 + plane_stride;
+        const float* plane2 = plane1 + plane_stride;
+        for (size_t px = 0; px < plane_size; ++px) {
+            *out++ = preprocessing_float_to_u8(plane0[px]);
+            *out++ = preprocessing_float_to_u8(plane1[px]);
+            *out++ = preprocessing_float_to_u8(plane2[px]);
+        }
+        return;
+    }
+
+    // Any other channel count: gather each pixel's samples plane by plane.
+    for (size_t px = 0; px < plane_size; ++px) {
+        for (int ch = 0; ch < n_channels; ++ch) {
+            *out++ = preprocessing_float_to_u8(first_plane[px + plane_stride * static_cast<size_t>(ch)]);
+        }
+    }
+}
+
 static inline sd::Tensor<float> sd_image_to_preprocessing_tensor(sd_image_t image) {
    sd::Tensor<float> tensor({static_cast<int64_t>(image.width), static_cast<int64_t>(image.height), static_cast<int64_t>(image.channel), 1});
    for (uint32_t y = 0; y < image.height; ++y) {
@ -39,20 +108,7 @@ static inline sd::Tensor<float> sd_image_to_preprocessing_tensor(sd_image_t imag
 static inline void preprocessing_tensor_to_sd_image(const sd::Tensor<float>& tensor, uint8_t* image_data) {
    GGML_ASSERT(tensor.dim() == 4);
    GGML_ASSERT(tensor.shape()[3] == 1);
-    GGML_ASSERT(image_data != nullptr);
-
-    int width   = static_cast<int>(tensor.shape()[0]);
-    int height  = static_cast<int>(tensor.shape()[1]);
-    int channel = static_cast<int>(tensor.shape()[2]);
-    for (int y = 0; y < height; ++y) {
-        for (int x = 0; x < width; ++x) {
-            for (int c = 0; c < channel; ++c) {
-                float value                               = preprocessing_get_4d(tensor, x, y, c, 0);
-                value                                     = std::min(1.0f, std::max(0.0f, value));
-                image_data[(y * width + x) * channel + c] = static_cast<uint8_t>(std::round(value * 255.0f));
-            }
-        }
-    }
+    preprocessing_tensor_frame_to_sd_image(tensor, 0, image_data);  // shape()[3] == 1, so frame 0 is the only frame; the helper asserts image_data is non-null
 }

 static inline sd::Tensor<float> gaussian_kernel_tensor(int kernel_size) {
--- a/src/util.cpp
+++ b/src/util.cpp
@ -505,17 +505,7 @@ sd_image_t tensor_to_sd_image(const sd::Tensor<float>& tensor, int frame_index)
    int channel   = static_cast<int>(shape[shape.size() == 5 ? 3 : 2]);
    uint8_t* data = (uint8_t*)malloc(static_cast<size_t>(width * height * channel));
    GGML_ASSERT(data != nullptr);
-
-    for (int iw = 0; iw < width; ++iw) {
-        for (int ih = 0; ih < height; ++ih) {
-            for (int ic = 0; ic < channel; ++ic) {
-                float value                            = shape.size() == 5 ? tensor.index(iw, ih, frame_index, ic, 0)
-                                                                           : tensor.index(iw, ih, ic, frame_index);
-                value                                  = std::clamp(value, 0.0f, 1.0f);
-                data[(ih * width + iw) * channel + ic] = static_cast<uint8_t>(std::round(value * 255.0f));
-            }
-        }
-    }
+    preprocessing_tensor_frame_to_sd_image(tensor, frame_index, data);
    return {
        static_cast<uint32_t>(width),
        static_cast<uint32_t>(height),