fix: improve handling of VAE decode failures (#1222)

2026-03-02 21:19:48 +01:00 · 2026-02-09 16:29:41 +01:00 · 2026-02-09 16:29:41 +01:00 · aa0b899397
commit aa0b899397
parent 5e264372ce
4 changed files with 48 additions and 24 deletions
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@ -409,7 +409,7 @@ bool save_results(const SDCliParams& cli_params,
    auto write_image = [&](const fs::path& path, int idx) {
        const sd_image_t& img = results[idx];
        if (!img.data)
-            return;
+            return false;

        std::string params = get_image_params(cli_params, ctx_params, gen_params, gen_params.seed + idx);
        int ok             = 0;
@ -419,8 +419,11 @@ bool save_results(const SDCliParams& cli_params,
            ok = stbi_write_png(path.string().c_str(), img.width, img.height, img.channel, img.data, 0, params.c_str());
        }
        LOG_INFO("save result image %d to '%s' (%s)", idx, path.string().c_str(), ok ? "success" : "failure");
+        return ok != 0;
    };

+    int sucessful_reults = 0;
+
    if (std::regex_search(cli_params.output_path, format_specifier_regex)) {
        if (!is_jpg && ext_lower != ".png")
            ext = ".png";
@ -429,9 +432,12 @@ bool save_results(const SDCliParams& cli_params,

        for (int i = 0; i < num_results; ++i) {
            fs::path img_path = format_frame_idx(pattern.string(), output_begin_idx + i);
-            write_image(img_path, i);
+            if (write_image(img_path, i)) {
+                sucessful_reults++;
+            }
        }
-        return true;
+        LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
+        return sucessful_reults != 0;
    }

    if (cli_params.mode == VID_GEN && num_results > 1) {
@ -439,9 +445,13 @@ bool save_results(const SDCliParams& cli_params,
            ext = ".avi";
        fs::path video_path = base_path;
        video_path += ext;
-        create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps);
-        LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str());
-        return true;
+        if (create_mjpg_avi_from_sd_images(video_path.string().c_str(), results, num_results, gen_params.fps) == 0) {  // a zero return is treated as success
+            LOG_INFO("save result MJPG AVI video to '%s'", video_path.string().c_str());
+            return true;
+        } else {
+            LOG_ERROR("Failed to save result MJPG AVI video to '%s'", video_path.string().c_str());
+            return false;  // report the failed video write to the caller
+        }
    }

    if (!is_jpg && ext_lower != ".png")
@ -453,10 +463,12 @@ bool save_results(const SDCliParams& cli_params,
            img_path += "_" + std::to_string(output_begin_idx + i);
        }
        img_path += ext;
-        write_image(img_path, i);
+        if (write_image(img_path, i)) {
+            sucessful_reults++;
+        }
    }
-
-    return true;
+    LOG_INFO("%d/%d images saved", sucessful_reults, num_results);
+    return sucessful_reults != 0;
 }

 int main(int argc, const char* argv[]) {
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -767,7 +767,7 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_silu_act(ggml_context* ctx, ggml_tensor*
    return x;
 }

-typedef std::function<void(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;
+typedef std::function<bool(ggml_tensor*, ggml_tensor*, bool)> on_tile_process;

 __STATIC_INLINE__ void sd_tiling_calc_tiles(int& num_tiles_dim,
                                            float& tile_overlap_factor_dim,
@ -918,12 +918,15 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,

            int64_t t1 = ggml_time_ms();
            ggml_ext_tensor_split_2d(input, input_tile, x_in, y_in);
-            on_processing(input_tile, output_tile, false);
-            ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy);
+            if (on_processing(input_tile, output_tile, false)) {
+                ggml_ext_tensor_merge_2d(output_tile, output, x_out, y_out, overlap_x_out, overlap_y_out, dx, dy);

-            int64_t t2 = ggml_time_ms();
-            last_time  = (t2 - t1) / 1000.0f;
-            pretty_progress(tile_count, num_tiles, last_time);
+                int64_t t2 = ggml_time_ms();
+                last_time  = (t2 - t1) / 1000.0f;
+                pretty_progress(tile_count, num_tiles, last_time);
+            } else {
+                LOG_ERROR("Failed to process patch %d at (%d, %d)", tile_count, x, y);
+            }
            tile_count++;
        }
        last_x = false;
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -1558,7 +1558,7 @@ public:
                if (vae_tiling_params.enabled) {
                    // split latent in 32x32 tiles and compute in several steps
                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                        first_stage_model->compute(n_threads, in, true, &out, nullptr);
+                        return first_stage_model->compute(n_threads, in, true, &out, nullptr);
                    };
                    silent_tiling(latents, result, get_vae_scale_factor(), 32, 0.5f, on_tiling);

@ -1577,7 +1577,7 @@ public:
                if (vae_tiling_params.enabled) {
                    // split latent in 64x64 tiles and compute in several steps
                    auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                        tae_first_stage->compute(n_threads, in, true, &out, nullptr);
+                        return tae_first_stage->compute(n_threads, in, true, &out, nullptr);
                    };
                    silent_tiling(latents, result, get_vae_scale_factor(), 64, 0.5f, on_tiling);
                } else {
@ -2546,7 +2546,7 @@ public:
                LOG_DEBUG("VAE Tile size: %dx%d", tile_size_x, tile_size_y);

                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                    first_stage_model->compute(n_threads, in, false, &out, work_ctx);
+                    return first_stage_model->compute(n_threads, in, false, &out, work_ctx);
                };
                sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
            } else {
@ -2557,7 +2557,7 @@ public:
            if (vae_tiling_params.enabled && !encode_video) {
                // split latent in 32x32 tiles and compute in several steps
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                    tae_first_stage->compute(n_threads, in, false, &out, nullptr);
+                    return tae_first_stage->compute(n_threads, in, false, &out, nullptr);
                };
                sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
            } else {
@ -2675,11 +2675,15 @@ public:

                // split latent in 32x32 tiles and compute in several steps
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                    first_stage_model->compute(n_threads, in, true, &out, nullptr);
+                    return first_stage_model->compute(n_threads, in, true, &out, nullptr);
                };
                sd_tiling_non_square(x, result, vae_scale_factor, tile_size_x, tile_size_y, tile_overlap, on_tiling);
            } else {
-                first_stage_model->compute(n_threads, x, true, &result, work_ctx);
+                if (!first_stage_model->compute(n_threads, x, true, &result, work_ctx)) {
+                    LOG_ERROR("Failed to decode latents");
+                    first_stage_model->free_compute_buffer();  // release the compute buffer before the early return
+                    return nullptr;  // no decoded tensor is produced on failure
+                }
            }
            first_stage_model->free_compute_buffer();
            process_vae_output_tensor(result);
@ -2687,11 +2691,15 @@ public:
            if (vae_tiling_params.enabled) {
                // split latent in 64x64 tiles and compute in several steps
                auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-                    tae_first_stage->compute(n_threads, in, true, &out);
+                    return tae_first_stage->compute(n_threads, in, true, &out);
                };
                sd_tiling(x, result, vae_scale_factor, 64, 0.5f, on_tiling);
            } else {
-                tae_first_stage->compute(n_threads, x, true, &result);
+                if (!tae_first_stage->compute(n_threads, x, true, &result)) {
+                    LOG_ERROR("Failed to decode latents");
+                    tae_first_stage->free_compute_buffer();  // release the compute buffer before the early return
+                    return nullptr;  // no decoded tensor is produced on failure
+                }
            }
            tae_first_stage->free_compute_buffer();
        }
@ -3461,6 +3469,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
        ggml_free(work_ctx);
        return nullptr;
    }
+    memset(result_images, 0, batch_count * sizeof(sd_image_t));

    for (size_t i = 0; i < decoded_images.size(); i++) {
        result_images[i].width   = width;
--- a/upscaler.cpp
+++ b/upscaler.cpp
@ -89,7 +89,7 @@ struct UpscalerGGML {

        ggml_tensor* upscaled = ggml_new_tensor_4d(upscale_ctx, GGML_TYPE_F32, output_width, output_height, 3, 1);
        auto on_tiling        = [&](ggml_tensor* in, ggml_tensor* out, bool init) {
-            esrgan_upscaler->compute(n_threads, in, &out);
+            return esrgan_upscaler->compute(n_threads, in, &out);
        };
        int64_t t0 = ggml_time_ms();
        sd_tiling(input_image_tensor, upscaled, esrgan_upscaler->scale, esrgan_upscaler->tile_size, 0.25f, on_tiling);