feat(mm): siglip model loading supports partial loading

In the previous commit, the LLaVA model was updated to support partial loading.

In this commit, the SigLIP model is updated in the same way.

This model is used for FLUX Redux. It's <4GB and is only ever run in isolation, so the vast majority of users won't see any benefit from partial loading. Regardless, I think it is best if we make _all_ models work with partial loading.

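PS: I also fixed the initial load dtype issue described in the previous commit: the model is now loaded directly in the target dtype (by passing `torch_dtype` to `from_pretrained`) instead of being loaded first and cast afterwards. It's probably a non-issue for this model, but we may as well fix it.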
This commit is contained in:
psychedelicious 2025-04-17 14:42:41 +10:00
parent c054501103
commit 814406d98a
4 changed files with 13 additions and 33 deletions

View File

@ -3,6 +3,7 @@ from typing import Literal, Optional
import torch
from PIL import Image
from transformers import SiglipImageProcessor, SiglipVisionModel
from invokeai.app.invocations.baseinvocation import (
BaseInvocation,
@ -115,8 +116,14 @@ class FluxReduxInvocation(BaseInvocation):
@torch.no_grad()
def _siglip_encode(self, context: InvocationContext, image: Image.Image) -> torch.Tensor:
siglip_model_config = self._get_siglip_model(context)
with context.models.load(siglip_model_config.key).model_on_device() as (_, siglip_pipeline):
assert isinstance(siglip_pipeline, SigLipPipeline)
with context.models.load(siglip_model_config.key).model_on_device() as (_, model):
assert isinstance(model, SiglipVisionModel)
model_abs_path = context.models.get_absolute_path(siglip_model_config)
processor = SiglipImageProcessor.from_pretrained(model_abs_path, local_files_only=True)
assert isinstance(processor, SiglipImageProcessor)
siglip_pipeline = SigLipPipeline(processor, model)
return siglip_pipeline.encode_image(
x=image, device=TorchDevice.choose_torch_device(), dtype=TorchDevice.choose_torch_dtype()
)

View File

@ -1,13 +1,14 @@
from pathlib import Path
from typing import Optional
from transformers import SiglipVisionModel
from invokeai.backend.model_manager.config import (
AnyModelConfig,
)
from invokeai.backend.model_manager.load.load_default import ModelLoader
from invokeai.backend.model_manager.load.model_loader_registry import ModelLoaderRegistry
from invokeai.backend.model_manager.taxonomy import AnyModel, BaseModelType, ModelFormat, ModelType, SubModelType
from invokeai.backend.sig_lip.sig_lip_pipeline import SigLipPipeline
@ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.SigLIP, format=ModelFormat.Diffusers)
@ -23,6 +24,5 @@ class SigLIPModelLoader(ModelLoader):
raise ValueError("Unexpected submodel requested for LLaVA OneVision model.")
model_path = Path(config.path)
model = SigLipPipeline.load_from_path(model_path)
model.to(dtype=self._torch_dtype)
model = SiglipVisionModel.from_pretrained(model_path, local_files_only=True, torch_dtype=self._torch_dtype)
return model

View File

@ -16,11 +16,9 @@ from invokeai.backend.image_util.depth_anything.depth_anything_pipeline import D
from invokeai.backend.image_util.grounding_dino.grounding_dino_pipeline import GroundingDinoPipeline
from invokeai.backend.image_util.segment_anything.segment_anything_pipeline import SegmentAnythingPipeline
from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
from invokeai.backend.llava_onevision_model import LlavaOnevisionModel
from invokeai.backend.model_manager.taxonomy import AnyModel
from invokeai.backend.onnx.onnx_runtime import IAIOnnxRuntimeModel
from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
from invokeai.backend.sig_lip.sig_lip_pipeline import SigLipPipeline
from invokeai.backend.spandrel_image_to_image_model import SpandrelImageToImageModel
from invokeai.backend.textual_inversion import TextualInversionModelRaw
from invokeai.backend.util.calc_tensor_size import calc_tensor_size
@ -51,8 +49,6 @@ def calc_model_size_by_data(logger: logging.Logger, model: AnyModel) -> int:
GroundingDinoPipeline,
SegmentAnythingPipeline,
DepthAnythingPipeline,
SigLipPipeline,
LlavaOnevisionModel,
),
):
return model.calc_size()

View File

@ -1,14 +1,9 @@
from pathlib import Path
from typing import Optional
import torch
from PIL import Image
from transformers import SiglipImageProcessor, SiglipVisionModel
from invokeai.backend.raw_model import RawModel
class SigLipPipeline(RawModel):
class SigLipPipeline:
"""A wrapper for a SigLIP model + processor."""
def __init__(
@ -19,25 +14,7 @@ class SigLipPipeline(RawModel):
self._siglip_processor = siglip_processor
self._siglip_model = siglip_model
@classmethod
def load_from_path(cls, path: str | Path):
siglip_model = SiglipVisionModel.from_pretrained(path, local_files_only=True)
assert isinstance(siglip_model, SiglipVisionModel)
siglip_processor = SiglipImageProcessor.from_pretrained(path, local_files_only=True)
assert isinstance(siglip_processor, SiglipImageProcessor)
return cls(siglip_processor, siglip_model)
def to(self, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None) -> None:
self._siglip_model.to(device=device, dtype=dtype)
def encode_image(self, x: Image.Image, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
imgs = self._siglip_processor.preprocess(images=[x], do_resize=True, return_tensors="pt", do_convert_rgb=True)
encoded_x = self._siglip_model(**imgs.to(device=device, dtype=dtype)).last_hidden_state
return encoded_x
def calc_size(self) -> int:
"""Get size of the model in memory in bytes."""
# HACK(ryand): Fix this issue with circular imports.
from invokeai.backend.model_manager.load.model_util import calc_module_size
return calc_module_size(self._siglip_model)