Add 'enable_partial_loading' config flag.

This commit is contained in:
Ryan Dick 2024-12-30 19:52:10 +00:00
parent 535e45cedf
commit d0bfa019be
4 changed files with 7 additions and 1 deletion

View File

@ -107,6 +107,7 @@ class InvokeAIAppConfig(BaseSettings):
vram: Amount of VRAM reserved for model storage (GB).
lazy_offload: Keep models in VRAM until their space is needed.
log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as it's used. Partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. If enabling this setting, make sure that your ram and vram cache limits are properly tuned.
device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.<br>Valid values: `auto`, `cpu`, `cuda`, `cuda:1`, `mps`
precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.<br>Valid values: `auto`, `float16`, `bfloat16`, `float32`
sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements.
@ -178,6 +179,7 @@ class InvokeAIAppConfig(BaseSettings):
vram: float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).")
lazy_offload: bool = Field(default=True, description="Keep models in VRAM until their space is needed.")
log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")
enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as it's used. Partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. If enabling this setting, make sure that your ram and vram cache limits are properly tuned.")
# DEVICE
device: DEVICE = Field(default="auto", description="Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.")

View File

@ -84,6 +84,7 @@ class ModelManagerService(ModelManagerServiceBase):
ram_cache = ModelCache(
max_ram_cache_size_gb=app_config.ram,
max_vram_cache_size_gb=app_config.vram,
enable_partial_loading=app_config.enable_partial_loading,
logger=logger,
execution_device=execution_device or TorchDevice.choose_torch_device(),
)

View File

@ -76,6 +76,7 @@ class ModelCache:
self,
max_ram_cache_size_gb: float,
max_vram_cache_size_gb: float,
enable_partial_loading: bool,
execution_device: torch.device | str = "cuda",
storage_device: torch.device | str = "cpu",
log_memory_usage: bool = False,
@ -102,6 +103,7 @@ class ModelCache:
self._max_ram_cache_size_gb = max_ram_cache_size_gb
self._max_vram_cache_size_gb = max_vram_cache_size_gb
self._enable_partial_loading = enable_partial_loading
self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__)
self._log_memory_usage = log_memory_usage
@ -142,7 +144,7 @@ class ModelCache:
running_with_cuda = self._execution_device.type == "cuda"
# Wrap model.
if isinstance(model, torch.nn.Module) and running_with_cuda:
if isinstance(model, torch.nn.Module) and running_with_cuda and self._enable_partial_loading:
wrapped_model = CachedModelWithPartialLoad(model, self._execution_device)
else:
wrapped_model = CachedModelOnlyFullLoad(model, self._execution_device, size)

View File

@ -94,6 +94,7 @@ def mm2_loader(mm2_app_config: InvokeAIAppConfig) -> ModelLoadServiceBase:
logger=InvokeAILogger.get_logger(),
max_ram_cache_size_gb=mm2_app_config.ram,
max_vram_cache_size_gb=mm2_app_config.vram,
enable_partial_loading=mm2_app_config.enable_partial_loading,
)
return ModelLoadService(
app_config=mm2_app_config,