From 36a3869af03ae46cbf53a28064293a038933b389 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Thu, 16 Jan 2025 15:14:52 +0000 Subject: [PATCH] Add keep_ram_copy_of_weights config option. --- invokeai/app/services/config/config_default.py | 2 ++ .../services/model_manager/model_manager_default.py | 1 + .../model_manager/load/model_cache/model_cache.py | 10 ++++++++-- tests/backend/model_manager/model_manager_fixtures.py | 1 + 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/invokeai/app/services/config/config_default.py b/invokeai/app/services/config/config_default.py index 6d95f2f7fa..c3d78bc52c 100644 --- a/invokeai/app/services/config/config_default.py +++ b/invokeai/app/services/config/config_default.py @@ -87,6 +87,7 @@ class InvokeAIAppConfig(BaseSettings): log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour. device_working_mem_gb: The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value. enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. + keep_ram_copy_of_weights: Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high; set it to True for improved speed if there is RAM to spare. 
ram: DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_ram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable. vram: DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_vram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable. lazy_offload: DEPRECATED: This setting is no longer used. Lazy-offloading is enabled by default. This config setting will be removed once the new model cache behavior is stable. @@ -162,6 +163,7 @@ class InvokeAIAppConfig(BaseSettings): log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.") device_working_mem_gb: float = Field(default=3, description="The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.") enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.") + keep_ram_copy_of_weights: bool = Field(default=True, description="Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. 
Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high; set it to True for improved speed if there is RAM to spare.") # Deprecated CACHE configs ram: Optional[float] = Field(default=None, gt=0, description="DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_ram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.") vram: Optional[float] = Field(default=None, ge=0, description="DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_vram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.") diff --git a/invokeai/app/services/model_manager/model_manager_default.py b/invokeai/app/services/model_manager/model_manager_default.py index cec3b0bc18..9ad10c5e73 100644 --- a/invokeai/app/services/model_manager/model_manager_default.py +++ b/invokeai/app/services/model_manager/model_manager_default.py @@ -84,6 +84,7 @@ class ModelManagerService(ModelManagerServiceBase): ram_cache = ModelCache( execution_device_working_mem_gb=app_config.device_working_mem_gb, enable_partial_loading=app_config.enable_partial_loading, + keep_ram_copy_of_weights=app_config.keep_ram_copy_of_weights, max_ram_cache_size_gb=app_config.max_cache_ram_gb, max_vram_cache_size_gb=app_config.max_cache_vram_gb, execution_device=execution_device or TorchDevice.choose_torch_device(), diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache.py b/invokeai/backend/model_manager/load/model_cache/model_cache.py index cf33cc9cfe..5ef6aefe61 100644 --- a/invokeai/backend/model_manager/load/model_cache/model_cache.py +++ b/invokeai/backend/model_manager/load/model_cache/model_cache.py @@ 
-78,6 +78,7 @@ class ModelCache: self, execution_device_working_mem_gb: float, enable_partial_loading: bool, + keep_ram_copy_of_weights: bool, max_ram_cache_size_gb: float | None = None, max_vram_cache_size_gb: float | None = None, execution_device: torch.device | str = "cuda", @@ -105,6 +106,7 @@ class ModelCache: :param logger: InvokeAILogger to use (otherwise creates one) """ self._enable_partial_loading = enable_partial_loading + self._keep_ram_copy_of_weights = keep_ram_copy_of_weights self._execution_device_working_mem_gb = execution_device_working_mem_gb self._execution_device: torch.device = torch.device(execution_device) self._storage_device: torch.device = torch.device(storage_device) @@ -154,9 +156,13 @@ class ModelCache: # Wrap model. if isinstance(model, torch.nn.Module) and running_with_cuda and self._enable_partial_loading: - wrapped_model = CachedModelWithPartialLoad(model, self._execution_device, keep_ram_copy=False) + wrapped_model = CachedModelWithPartialLoad( + model, self._execution_device, keep_ram_copy=self._keep_ram_copy_of_weights + ) else: - wrapped_model = CachedModelOnlyFullLoad(model, self._execution_device, size) + wrapped_model = CachedModelOnlyFullLoad( + model, self._execution_device, size, keep_ram_copy=self._keep_ram_copy_of_weights + ) cache_record = CacheRecord(key=key, cached_model=wrapped_model) self._cached_models[key] = cache_record diff --git a/tests/backend/model_manager/model_manager_fixtures.py b/tests/backend/model_manager/model_manager_fixtures.py index 4a91ea70f4..87d617662e 100644 --- a/tests/backend/model_manager/model_manager_fixtures.py +++ b/tests/backend/model_manager/model_manager_fixtures.py @@ -94,6 +94,7 @@ def mm2_loader(mm2_app_config: InvokeAIAppConfig) -> ModelLoadServiceBase: ram_cache = ModelCache( execution_device_working_mem_gb=mm2_app_config.device_working_mem_gb, enable_partial_loading=mm2_app_config.enable_partial_loading, + keep_ram_copy_of_weights=mm2_app_config.keep_ram_copy_of_weights, 
max_ram_cache_size_gb=mm2_app_config.max_cache_ram_gb, max_vram_cache_size_gb=mm2_app_config.max_cache_vram_gb, execution_device=TorchDevice.choose_torch_device(),