From f417c269d14d01c07ed99719e5e23c2d3bd32856 Mon Sep 17 00:00:00 2001
From: Alexander Eichhorn
Date: Tue, 16 Dec 2025 15:58:48 +0100
Subject: [PATCH] fix(vae): Fix dtype mismatch in FP32 VAE decode mode

The previous mixed-precision optimization for FP32 mode only converted
some VAE decoder layers (post_quant_conv, conv_in, mid_block) to the
latents dtype while leaving others (up_blocks, conv_norm_out) in
float32. This caused "expected scalar type Half but found Float" errors
after recent diffusers updates.

Simplify FP32 mode to consistently use float32 for both VAE and
latents, removing the incomplete mixed-precision logic. This trades
some VRAM usage for stability and correctness.

Also removes now-unused attention processor imports.
---
 invokeai/app/invocations/latents_to_image.py | 27 ++------------------
 1 file changed, 2 insertions(+), 25 deletions(-)

diff --git a/invokeai/app/invocations/latents_to_image.py b/invokeai/app/invocations/latents_to_image.py
index ab1096caf7..608485a078 100644
--- a/invokeai/app/invocations/latents_to_image.py
+++ b/invokeai/app/invocations/latents_to_image.py
@@ -2,12 +2,6 @@ from contextlib import nullcontext
 
 import torch
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.models.attention_processor import (
-    AttnProcessor2_0,
-    LoRAAttnProcessor2_0,
-    LoRAXFormersAttnProcessor,
-    XFormersAttnProcessor,
-)
 from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
 from diffusers.models.autoencoders.autoencoder_tiny import AutoencoderTiny
 
@@ -77,26 +71,9 @@ class LatentsToImageInvocation(BaseInvocation, WithMetadata, WithBoard):
             assert isinstance(vae, (AutoencoderKL, AutoencoderTiny))
             latents = latents.to(TorchDevice.choose_torch_device())
             if self.fp32:
+                # FP32 mode: convert everything to float32 for maximum precision
                 vae.to(dtype=torch.float32)
-
-                use_torch_2_0_or_xformers = hasattr(vae.decoder, "mid_block") and isinstance(
-                    vae.decoder.mid_block.attentions[0].processor,
-                    (
-                        AttnProcessor2_0,
-                        XFormersAttnProcessor,
-                        LoRAXFormersAttnProcessor,
-                        LoRAAttnProcessor2_0,
-                    ),
-                )
-                # if xformers or torch_2_0 is used attention block does not need
-                # to be in float32 which can save lots of memory
-                if use_torch_2_0_or_xformers:
-                    vae.post_quant_conv.to(latents.dtype)
-                    vae.decoder.conv_in.to(latents.dtype)
-                    vae.decoder.mid_block.to(latents.dtype)
-                else:
-                    latents = latents.float()
-
+                latents = latents.float()
             else:
                 vae.to(dtype=torch.float16)
                 latents = latents.half()
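
For reference, the essence of the fix is that the VAE and the latents it
decodes must end up in the same dtype, rather than casting only a few
decoder submodules. Below is a minimal standalone sketch of that rule
(not part of the patch): the decode_latents helper and its fp32 flag are
illustrative names rather than InvokeAI API, and latent scaling is
omitted for brevity.

    import torch
    from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL

    def decode_latents(vae: AutoencoderKL, latents: torch.Tensor, fp32: bool) -> torch.Tensor:
        """Decode latents with a VAE, keeping module and tensor dtypes in sync."""
        if fp32:
            # FP32 path: cast the whole VAE and the latents to float32. Uses more
            # VRAM, but avoids "expected scalar type Half but found Float" errors
            # that occur when some submodules are left in a different dtype.
            vae.to(dtype=torch.float32)
            latents = latents.float()
        else:
            # FP16 path: everything in float16.
            vae.to(dtype=torch.float16)
            latents = latents.half()
        with torch.no_grad():
            # Latent scaling (e.g. vae.config.scaling_factor) is omitted here.
            return vae.decode(latents, return_dict=False)[0]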