mirror of https://github.com/invoke-ai/InvokeAI
Add 'enable_partial_loading' config flag.
parent 535e45cedf
commit d0bfa019be
@@ -107,6 +107,7 @@ class InvokeAIAppConfig(BaseSettings):
         vram: Amount of VRAM reserved for model storage (GB).
         lazy_offload: Keep models in VRAM until their space is needed.
         log_memory_usage: If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.
+        enable_partial_loading: Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. Partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. If enabling this setting, make sure that your ram and vram cache limits are properly tuned.
         device: Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.<br>Valid values: `auto`, `cpu`, `cuda`, `cuda:1`, `mps`
         precision: Floating point precision. `float16` will consume half the memory of `float32` but produce slightly lower-quality images. The `auto` setting will guess the proper precision based on your video card and operating system.<br>Valid values: `auto`, `float16`, `bfloat16`, `float32`
         sequential_guidance: Whether to calculate guidance in serial instead of in parallel, lowering memory requirements.
@@ -178,6 +179,7 @@ class InvokeAIAppConfig(BaseSettings):
     vram: float = Field(default=DEFAULT_VRAM_CACHE, ge=0, description="Amount of VRAM reserved for model storage (GB).")
     lazy_offload: bool = Field(default=True, description="Keep models in VRAM until their space is needed.")
     log_memory_usage: bool = Field(default=False, description="If True, a memory snapshot will be captured before and after every model cache operation, and the result will be logged (at debug level). There is a time cost to capturing the memory snapshots, so it is recommended to only enable this feature if you are actively inspecting the model cache's behaviour.")
+    enable_partial_loading: bool = Field(default=False, description="Enable partial loading of models. This enables models to run with reduced VRAM requirements (at the cost of slower speed) by streaming the model from RAM to VRAM as its used. Partial loading can cause models to run more slowly if they were previously being fully loaded into VRAM. If enabling this setting, make sure that your ram and vram cache limits are properly tuned.")

     # DEVICE
     device: DEVICE = Field(default="auto", description="Preferred execution device. `auto` will choose the device depending on the hardware platform and the installed torch capabilities.")
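The two hunks above document and declare the new setting. It defaults to False, so existing installs keep the current full-load behaviour unless they opt in. A minimal sketch of opting in programmatically, assuming InvokeAIAppConfig can be constructed directly with keyword arguments (the import path and the ram/vram values below are illustrative assumptions, not part of this commit):

    # Hypothetical sketch: enable partial loading and tune the cache limits
    # alongside it, as the field description recommends.
    from invokeai.app.services.config.config_default import InvokeAIAppConfig  # assumed import path

    config = InvokeAIAppConfig(
        enable_partial_loading=True,  # new flag; defaults to False
        ram=12.0,                     # RAM cache size in GB (illustrative value)
        vram=4.0,                     # VRAM cache size in GB (illustrative value)
    )
    print(config.enable_partial_loading)  # True

In normal use the same keys would typically be set in the user's invokeai.yaml rather than in code.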
@@ -84,6 +84,7 @@ class ModelManagerService(ModelManagerServiceBase):
         ram_cache = ModelCache(
             max_ram_cache_size_gb=app_config.ram,
             max_vram_cache_size_gb=app_config.vram,
+            enable_partial_loading=app_config.enable_partial_loading,
             logger=logger,
             execution_device=execution_device or TorchDevice.choose_torch_device(),
         )
@@ -76,6 +76,7 @@ class ModelCache:
         self,
         max_ram_cache_size_gb: float,
         max_vram_cache_size_gb: float,
+        enable_partial_loading: bool,
         execution_device: torch.device | str = "cuda",
         storage_device: torch.device | str = "cpu",
         log_memory_usage: bool = False,
@@ -102,6 +103,7 @@ class ModelCache:

         self._max_ram_cache_size_gb = max_ram_cache_size_gb
         self._max_vram_cache_size_gb = max_vram_cache_size_gb
+        self._enable_partial_loading = enable_partial_loading

         self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__)
         self._log_memory_usage = log_memory_usage
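Taken together, the constructor change and the assignment above mean any caller constructing a ModelCache must now pass the flag explicitly. A rough usage sketch, not part of the commit; the import path is an assumption that may differ between releases, and the cache sizes are illustrative:

    # Hypothetical direct construction of a ModelCache with the new argument.
    import torch

    from invokeai.backend.model_manager.load.model_cache.model_cache import ModelCache  # assumed path

    cache = ModelCache(
        max_ram_cache_size_gb=12.0,
        max_vram_cache_size_gb=4.0,
        enable_partial_loading=True,
        execution_device=torch.device("cuda"),
        storage_device=torch.device("cpu"),
        log_memory_usage=False,
    )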
@@ -142,7 +144,7 @@ class ModelCache:
         running_with_cuda = self._execution_device.type == "cuda"

         # Wrap model.
-        if isinstance(model, torch.nn.Module) and running_with_cuda:
+        if isinstance(model, torch.nn.Module) and running_with_cuda and self._enable_partial_loading:
             wrapped_model = CachedModelWithPartialLoad(model, self._execution_device)
         else:
             wrapped_model = CachedModelOnlyFullLoad(model, self._execution_device, size)
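The net effect of this hunk is that partial loading is now opt-in: on CUDA, torch.nn.Module models only get the partial-load wrapper when the flag is set, while the default configuration and non-CUDA devices (cpu, mps) keep the existing full-load path. A simplified, standalone restatement of that selection follows; choose_wrapper is a hypothetical helper name and the wrapper import paths are assumptions:

    # Hypothetical restatement of the wrapper selection above, not the actual method.
    import torch

    from invokeai.backend.model_manager.load.model_cache.cached_model.cached_model_only_full_load import (
        CachedModelOnlyFullLoad,  # assumed path
    )
    from invokeai.backend.model_manager.load.model_cache.cached_model.cached_model_with_partial_load import (
        CachedModelWithPartialLoad,  # assumed path
    )

    def choose_wrapper(model, execution_device: torch.device, size: int, enable_partial_loading: bool):
        running_with_cuda = execution_device.type == "cuda"
        if isinstance(model, torch.nn.Module) and running_with_cuda and enable_partial_loading:
            # Weights can be streamed between RAM and VRAM on demand.
            return CachedModelWithPartialLoad(model, execution_device)
        # Default path (flag off, non-module models, or non-CUDA devices): full load only.
        return CachedModelOnlyFullLoad(model, execution_device, size)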
@@ -94,6 +94,7 @@ def mm2_loader(mm2_app_config: InvokeAIAppConfig) -> ModelLoadServiceBase:
         logger=InvokeAILogger.get_logger(),
         max_ram_cache_size_gb=mm2_app_config.ram,
         max_vram_cache_size_gb=mm2_app_config.vram,
+        enable_partial_loading=mm2_app_config.enable_partial_loading,
     )
     return ModelLoadService(
         app_config=mm2_app_config,
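The test fixture mirrors the production wiring, so whatever the config fixture specifies is what the cache under test receives. One hypothetical way to exercise both paths in the suite (the parametrized wrapper below is an assumption, not something this commit adds; it relies on the existing mm2_app_config fixture named above):

    # Hypothetical pytest sketch: run dependent tests once with partial loading
    # off and once with it on.
    import pytest

    @pytest.fixture(params=[False, True], ids=["full_load", "partial_load"])
    def mm2_app_config_with_partial_loading(mm2_app_config, request):
        # 'mm2_app_config' is the existing fixture shown above; this parametrized
        # wrapper is a hypothetical addition, not part of the commit.
        mm2_app_config.enable_partial_loading = request.param
        return mm2_app_config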