Add 'enable_partial_loading' config flag.

This commit is contained in:
Ryan Dick 2024-12-30 19:52:10 +00:00
parent 535e45cedf
commit d0bfa019be
4 changed files with 7 additions and 1 deletion

View File

@ -107,6 +107,7 @@ class InvokeAIAppConfig(BaseSettings):
vram: Amount of VRAM reserved for model storage (GB).
lazy_offload: Keep models in VRAM until their space is needed.
log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as it's used. Partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. If enabling this setting, make sure that your ram and vram cache limits are properly tuned.
device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.<br>Valid values: `auto`, `cpu`, `cuda`, `cuda:1`, `mps`
precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.<br>Valid values: `auto`, `float16`, `bfloat16`, `float32`
sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements.
@ -178,6 +179,7 @@ class InvokeAIAppConfig(BaseSettings):
vram: float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).")
lazy_offload: bool = Field(default=True, description="Keep models in VRAM until their space is needed.")
log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")
enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as it's used. Partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. If enabling this setting, make sure that your ram and vram cache limits are properly tuned.")
# DEVICE
device: DEVICE = Field(default="auto", description="Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.")

View File

@ -84,6 +84,7 @@ class ModelManagerService(ModelManagerServiceBase):
ram_cache = ModelCache(
max_ram_cache_size_gb=app_config.ram,
max_vram_cache_size_gb=app_config.vram,
enable_partial_loading=app_config.enable_partial_loading,
logger=logger,
execution_device=execution_device or TorchDevice.choose_torch_device(),
)

View File

@ -76,6 +76,7 @@ class ModelCache:
self,
max_ram_cache_size_gb: float,
max_vram_cache_size_gb: float,
enable_partial_loading: bool,
execution_device: torch.device | str = "cuda",
storage_device: torch.device | str = "cpu",
log_memory_usage: bool = False,
@ -102,6 +103,7 @@ class ModelCache:
self._max_ram_cache_size_gb = max_ram_cache_size_gb
self._max_vram_cache_size_gb = max_vram_cache_size_gb
self._enable_partial_loading = enable_partial_loading
self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__)
self._log_memory_usage = log_memory_usage
@ -142,7 +144,7 @@ class ModelCache:
running_with_cuda = self._execution_device.type == "cuda"
# Wrap model.
if isinstance(model, torch.nn.Module) and running_with_cuda:
if isinstance(model, torch.nn.Module) and running_with_cuda and self._enable_partial_loading:
wrapped_model = CachedModelWithPartialLoad(model, self._execution_device)
else:
wrapped_model = CachedModelOnlyFullLoad(model, self._execution_device, size)

View File

@ -94,6 +94,7 @@ def mm2_loader(mm2_app_config: InvokeAIAppConfig) -> ModelLoadServiceBase:
logger=InvokeAILogger.get_logger(),
max_ram_cache_size_gb=mm2_app_config.ram,
max_vram_cache_size_gb=mm2_app_config.vram,
enable_partial_loading=mm2_app_config.enable_partial_loading,
)
return ModelLoadService(
app_config=mm2_app_config,