From d0bfa019be04219136291cd4f397c9457a4f0caf Mon Sep 17 00:00:00 2001
From: Ryan Dick
Date: Mon, 30 Dec 2024 19:52:10 +0000
Subject: [PATCH] Add 'enable_partial_loading' config flag.

---
 invokeai/app/services/config/config_default.py                | 2 ++
 invokeai/app/services/model_manager/model_manager_default.py  | 1 +
 .../backend/model_manager/load/model_cache/model_cache.py     | 4 +++-
 tests/backend/model_manager/model_manager_fixtures.py         | 1 +
 4 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py
index 4c0333a260..52653de0f4 100644
--- a/invokeai/app/services/config/config_default.py
+++ b/invokeai/app/services/config/config_default.py
@@ -107,6 +107,7 @@ class InvokeAIAppConfig(BaseSettings):
         vram: Amount of VRAM reserved for model storage (GB).
         lazy_offload: Keep models in VRAM until their space is needed.
         log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
+        enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as it is used. Partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. If you enable this setting, make sure that your ram and vram cache limits are properly tuned.
         device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.<br>Valid values: `auto`, `cpu`, `cuda`, `cuda:1`, `mps`
         precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.<br>Valid values: `auto`, `float16`, `bfloat16`, `float32`
         sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements.
@@ -178,6 +179,7 @@ class InvokeAIAppConfig(BaseSettings):
     vram: float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).")
     lazy_offload: bool = Field(default=True, description="Keep models in VRAM until their space is needed.")
     log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")
+    enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as it is used. Partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. If you enable this setting, make sure that your ram and vram cache limits are properly tuned.")

     # DEVICE
     device: DEVICE = Field(default="auto", description="Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.")
diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py
index bdd1f5da43..c7bcd43d7a 100644
--- a/invokeai/app/services/model_manager/model_manager_default.py
+++ b/invokeai/app/services/model_manager/model_manager_default.py
@@ -84,6 +84,7 @@ class ModelManagerService(ModelManagerServiceBase):
         ram_cache = ModelCache(
             max_ram_cache_size_gb=app_config.ram,
             max_vram_cache_size_gb=app_config.vram,
+            enable_partial_loading=app_config.enable_partial_loading,
             logger=logger,
             execution_device=execution_device or TorchDevice.choose_torch_device(),
         )
diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache.py b/invokeai/backend/model_manager/load/model_cache/model_cache.py
index ecf3ffa623..377f4910b4 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache.py
@@ -76,6 +76,7 @@ class ModelCache:
         self,
         max_ram_cache_size_gb: float,
         max_vram_cache_size_gb: float,
+        enable_partial_loading: bool,
         execution_device: torch.device | str = "cuda",
         storage_device: torch.device | str = "cpu",
         log_memory_usage: bool = False,
@@ -102,6 +103,7 @@ class ModelCache:

         self._max_ram_cache_size_gb = max_ram_cache_size_gb
         self._max_vram_cache_size_gb = max_vram_cache_size_gb
+        self._enable_partial_loading = enable_partial_loading

         self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__)
         self._log_memory_usage = log_memory_usage
@@ -142,7 +144,7 @@ class ModelCache:
         running_with_cuda = self._execution_device.type == "cuda"

         # Wrap model.
-        if isinstance(model, torch.nn.Module) and running_with_cuda:
+        if isinstance(model, torch.nn.Module) and running_with_cuda and self._enable_partial_loading:
             wrapped_model = CachedModelWithPartialLoad(model, self._execution_device)
         else:
             wrapped_model = CachedModelOnlyFullLoad(model, self._execution_device, size)
diff --git a/tests/backend/model_manager/model_manager_fixtures.py b/tests/backend/model_manager/model_manager_fixtures.py
index f396a93d2d..4449bbaf62 100644
--- a/tests/backend/model_manager/model_manager_fixtures.py
+++ b/tests/backend/model_manager/model_manager_fixtures.py
@@ -94,6 +94,7 @@ def mm2_loader(mm2_app_config: InvokeAIAppConfig) -> ModelLoadServiceBase:
         logger=InvokeAILogger.get_logger(),
         max_ram_cache_size_gb=mm2_app_config.ram,
         max_vram_cache_size_gb=mm2_app_config.vram,
+        enable_partial_loading=mm2_app_config.enable_partial_loading,
     )
     return ModelLoadService(
         app_config=mm2_app_config,
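Reviewer note: the sketch below is a minimal, hypothetical usage example mirroring the wiring this patch adds in `model_manager_default.py`; it is not part of the patch. The import paths are inferred from the file paths in the diff, and the location of `InvokeAILogger` is an assumption.

```python
# Minimal sketch (assumptions noted above): opt in to partial loading via the new
# config flag and hand it to ModelCache, as ModelManagerService does in this patch.
from invokeai.app.services.config.config_default import InvokeAIAppConfig
from invokeai.backend.model_manager.load.model_cache.model_cache import ModelCache
from invokeai.backend.util.logging import InvokeAILogger  # assumed import path

# enable_partial_loading defaults to False; turning it on trades speed for a lower
# VRAM floor, so the ram/vram cache limits should be retuned alongside it.
app_config = InvokeAIAppConfig(enable_partial_loading=True)

ram_cache = ModelCache(
    max_ram_cache_size_gb=app_config.ram,
    max_vram_cache_size_gb=app_config.vram,
    enable_partial_loading=app_config.enable_partial_loading,
    logger=InvokeAILogger.get_logger(),
)
```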
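The behavioural core of the change is the wrapping branch in `ModelCache`: partial loading is opt-in, and even when enabled it only applies to `torch.nn.Module` models running on a CUDA execution device; everything else keeps the previous full-load behaviour. The hypothetical helper below (not part of the codebase) restates that condition so it can be sanity-checked in isolation.

```python
import torch


def uses_partial_load(model: object, execution_device: torch.device, enable_partial_loading: bool) -> bool:
    """Restate the wrapping condition added by this patch (hypothetical helper, illustration only).

    True  -> the model would be wrapped in CachedModelWithPartialLoad (weights streamed RAM -> VRAM).
    False -> the model would be wrapped in CachedModelOnlyFullLoad (the previous default behaviour).
    """
    running_with_cuda = execution_device.type == "cuda"
    return isinstance(model, torch.nn.Module) and running_with_cuda and enable_partial_loading


# With the flag at its default (False), a CUDA-bound nn.Module keeps the old full-load path.
assert not uses_partial_load(torch.nn.Linear(4, 4), torch.device("cuda"), enable_partial_loading=False)
assert uses_partial_load(torch.nn.Linear(4, 4), torch.device("cuda"), enable_partial_loading=True)
# Non-module objects (e.g. a plain state dict) never use partial loading, regardless of the flag.
assert not uses_partial_load({"weight": torch.zeros(4)}, torch.device("cuda"), enable_partial_loading=True)
```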