Add keep_ram_copy_of_weights config option.

Ryan Dick 2025-01-16 15:14:52 +00:00
parent c76d08d1fd
commit 36a3869af0
4 changed files with 12 additions and 2 deletions

View File

@@ -87,6 +87,7 @@ class InvokeAIAppConfig(BaseSettings):
log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
device_working_mem_gb: The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.
enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as it's used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.
keep_ram_copy_of_weights: Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high; set it to True for improved speed if there is RAM to spare.
ram: DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_ram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.
vram: DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_vram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.
lazy_offload: DEPRECATED: This setting is no longer used. Lazy-offloading is enabled by default. This config setting will be removed once the new model cache behavior is stable.
@@ -162,6 +163,7 @@ class InvokeAIAppConfig(BaseSettings):
log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")
device_working_mem_gb: float = Field(default=3, description="The amount of working memory to keep available on the compute device (in GB). Has no effect if running on CPU. If you are experiencing OOM errors, try increasing this value.")
enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as it's used. In some edge cases, partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM.")
keep_ram_copy_of_weights: bool = Field(default=True, description="Whether to keep a full RAM copy of a model's weights when the model is loaded in VRAM. Keeping a RAM copy increases average RAM usage, but speeds up model switching and LoRA patching (assuming there is sufficient RAM). Set this to False if RAM pressure is consistently high.")
# Deprecated CACHE configs
ram: Optional[float] = Field(default=None, gt=0, description="DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_ram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.")
vram: Optional[float] = Field(default=None, ge=0, description="DEPRECATED: This setting is no longer used. It has been replaced by `max_cache_vram_gb`, but most users will not need to use this config since automatic cache size limits should work well in most cases. This config setting will be removed once the new model cache behavior is stable.")
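
The new option defaults to True. As a reference, here is a minimal sketch of toggling it programmatically; the import path is an assumption based on the InvokeAI source layout, and the usual route is simply setting keep_ram_copy_of_weights in invokeai.yaml.

from invokeai.app.services.config.config_default import InvokeAIAppConfig  # assumed path

# Default behaviour: keep the RAM copy for faster model switching and LoRA patching.
config = InvokeAIAppConfig()
assert config.keep_ram_copy_of_weights is True

# On RAM-constrained systems, trade switching speed for lower RAM usage.
low_ram_config = InvokeAIAppConfig(keep_ram_copy_of_weights=False)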

View File

@@ -84,6 +84,7 @@ class ModelManagerService(ModelManagerServiceBase):
ram_cache = ModelCache(
execution_device_working_mem_gb=app_config.device_working_mem_gb,
enable_partial_loading=app_config.enable_partial_loading,
keep_ram_copy_of_weights=app_config.keep_ram_copy_of_weights,
max_ram_cache_size_gb=app_config.max_cache_ram_gb,
max_vram_cache_size_gb=app_config.max_cache_vram_gb,
execution_device=execution_device or TorchDevice.choose_torch_device(),
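
At this layer the change is pure wiring: the flag is read from the app config and threaded into the cache. A minimal sketch of constructing ModelCache directly with the new keyword, for callers outside ModelManagerService (the module paths below are assumptions, as they are not shown in this diff):

from invokeai.backend.model_manager.load.model_cache.model_cache import ModelCache  # assumed path
from invokeai.backend.util.devices import TorchDevice  # assumed path

ram_cache = ModelCache(
    execution_device_working_mem_gb=3.0,
    enable_partial_loading=False,
    keep_ram_copy_of_weights=True,  # app config default; False lowers RAM pressure
    execution_device=TorchDevice.choose_torch_device(),
)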

View File

@@ -78,6 +78,7 @@ class ModelCache:
self,
execution_device_working_mem_gb: float,
enable_partial_loading: bool,
keep_ram_copy_of_weights: bool,
max_ram_cache_size_gb: float | None = None,
max_vram_cache_size_gb: float | None = None,
execution_device: torch.device | str = "cuda",
@@ -105,6 +106,7 @@ class ModelCache:
:param logger: InvokeAILogger to use (otherwise creates one)
"""
self._enable_partial_loading = enable_partial_loading
self._keep_ram_copy_of_weights = keep_ram_copy_of_weights
self._execution_device_working_mem_gb = execution_device_working_mem_gb
self._execution_device: torch.device = torch.device(execution_device)
self._storage_device: torch.device = torch.device(storage_device)
@@ -154,9 +156,13 @@
# Wrap model.
if isinstance(model, torch.nn.Module) and running_with_cuda and self._enable_partial_loading:
wrapped_model = CachedModelWithPartialLoad(model, self._execution_device, keep_ram_copy=False)
wrapped_model = CachedModelWithPartialLoad(
model, self._execution_device, keep_ram_copy=self._keep_ram_copy_of_weights
)
else:
wrapped_model = CachedModelOnlyFullLoad(model, self._execution_device, size)
wrapped_model = CachedModelOnlyFullLoad(
model, self._execution_device, size, keep_ram_copy=self._keep_ram_copy_of_weights
)
cache_record = CacheRecord(key=key, cached_model=wrapped_model)
self._cached_models[key] = cache_record
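
The wrappers receive the flag as keep_ram_copy. As a rough illustration of what that copy buys (a hypothetical sketch only; the real CachedModelWithPartialLoad and CachedModelOnlyFullLoad implementations are not part of this diff), a CPU-resident copy of the weights lets the cache restore or patch the model without re-reading the checkpoint from disk:

import torch


class RamCopySketch:
    """Hypothetical illustration only: optionally snapshot the weights on the CPU."""

    def __init__(self, model: torch.nn.Module, keep_ram_copy: bool):
        self._model = model
        # With keep_ram_copy=True, a CPU copy of every tensor is retained, so evicting
        # the model from VRAM or applying LoRA patches does not require reloading from
        # disk. With keep_ram_copy=False, only the live (possibly VRAM) weights exist.
        self._cpu_state_dict = (
            {k: v.detach().to("cpu") for k, v in model.state_dict().items()}
            if keep_ram_copy
            else None
        )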

View File

@ -94,6 +94,7 @@ def mm2_loader(mm2_app_config: InvokeAIAppConfig) -> ModelLoadServiceBase:
ram_cache = ModelCache(
execution_device_working_mem_gb=mm2_app_config.device_working_mem_gb,
enable_partial_loading=mm2_app_config.enable_partial_loading,
keep_ram_copy_of_weights=mm2_app_config.keep_ram_copy_of_weights,
max_ram_cache_size_gb=mm2_app_config.max_cache_ram_gb,
max_vram_cache_size_gb=mm2_app_config.max_cache_vram_gb,
execution_device=TorchDevice.choose_torch_device(),
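
A hedged sketch of how the test fixtures could exercise both settings; the parametrized fixture below is hypothetical and not part of this commit (mm2_app_config is the existing fixture shown above, and the import path is assumed):

import pytest

from invokeai.app.services.config.config_default import InvokeAIAppConfig  # assumed path


@pytest.fixture(params=[True, False], ids=["ram_copy", "no_ram_copy"])
def mm2_app_config_with_ram_copy_flag(
    request: pytest.FixtureRequest, mm2_app_config: InvokeAIAppConfig
) -> InvokeAIAppConfig:
    # Reuse the existing app config fixture, overriding only the flag under test.
    mm2_app_config.keep_ram_copy_of_weights = request.param
    return mm2_app_config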