From 5e6b747baf98c051a4232822d294d3970e6fe732 Mon Sep 17 00:00:00 2001
From: Yiqing-Zhou
Date: Tue, 9 May 2023 23:00:28 +0800
Subject: [PATCH] [fix] add patch to fix DeepSpeedStrategy offload 'zero_force_ds_cpu_optimizer' issue

---
 lit_module.py  |  12 +++++
 lit_patches.py | 133 ++++++++++++++++++++++++++++++++++++++++++++++++-
 lit_train.py   |   2 +-
 3 files changed, 144 insertions(+), 3 deletions(-)

diff --git a/lit_module.py b/lit_module.py
index f2390dd..c1c5393 100644
--- a/lit_module.py
+++ b/lit_module.py
@@ -68,6 +68,18 @@ class LitModule(pl.LightningModule):
         self.log('accuracy', self.metric_accuracy, rank_zero_only=True)
 
     def configure_optimizers(self):
+        strategy = self.trainer.strategy
+        if isinstance(strategy, pl.strategies.DeepSpeedStrategy):
+            assert "optimizer" not in strategy.config
+            zero_config = strategy.config.get("zero_optimization")
+            if zero_config is not None:
+                if "offload_optimizer" in zero_config:
+                    import deepspeed
+
+                    optimizer = deepspeed.ops.adam.DeepSpeedCPUAdam(
+                        self.trainer.model.parameters(), lr=self.learning_rate
+                    )
+                    return optimizer
         optimizer = torch.optim.AdamW(
             self.trainer.model.parameters(), lr=self.learning_rate
         )
diff --git a/lit_patches.py b/lit_patches.py
index 6f9d1f1..01f50cf 100644
--- a/lit_patches.py
+++ b/lit_patches.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union
 
 import pytorch_lightning as pl
 from torch.nn import Module
@@ -47,5 +47,134 @@ class FSDPStrategy(pl.strategies.FSDPStrategy):
         cls._registered_strategies.append("fsdp_cpu_offload")
 
 
-def apply_all_patches():
+class DeepSpeedStrategy(pl.strategies.DeepSpeedStrategy):
+    def _create_default_config(
+        self,
+        zero_optimization: bool,
+        zero_allow_untested_optimizer: bool,
+        logging_batch_size_per_gpu: Union[str, int],
+        partition_activations: bool,
+        cpu_checkpointing: bool,
+        contiguous_memory_optimization: bool,
+        synchronize_checkpoint_boundary: bool,
+        offload_optimizer: bool,
+        offload_parameters: bool,
+        nvme_path: str,
+        offload_params_device: str,
+        params_buffer_count: int,
+        params_buffer_size: int,
+        max_in_cpu: int,
+        offload_optimizer_device: str,
+        optimizer_buffer_count: int,
+        pin_memory: bool,
+        block_size: int,
+        queue_depth: int,
+        single_submit: bool,
+        overlap_events: bool,
+        thread_count: int,
+        **zero_kwargs: Any,
+    ) -> Dict:
+        cfg = super()._create_default_config(
+            zero_optimization,
+            zero_allow_untested_optimizer,
+            logging_batch_size_per_gpu,
+            partition_activations,
+            cpu_checkpointing,
+            contiguous_memory_optimization,
+            synchronize_checkpoint_boundary,
+            offload_optimizer,
+            offload_parameters,
+            nvme_path,
+            offload_params_device,
+            params_buffer_count,
+            params_buffer_size,
+            max_in_cpu,
+            offload_optimizer_device,
+            optimizer_buffer_count,
+            pin_memory,
+            block_size,
+            queue_depth,
+            single_submit,
+            overlap_events,
+            thread_count,
+            **zero_kwargs,
+        )
+        if zero_optimization:
+            if offload_parameters:
+                cfg = {
+                    "zero_force_ds_cpu_optimizer": False,
+                    **cfg,
+                }
+        return cfg
+
+    @classmethod
+    def register_strategies(cls, strategy_registry: Dict) -> None:
+        strategy_registry.register(
+            "deepspeed",
+            cls,
+            description="Default DeepSpeed Strategy",
+            override=True,
+        )
+        strategy_registry.register(
+            "deepspeed_stage_1",
+            cls,
+            description="DeepSpeed with ZeRO Stage 1 enabled",
+            stage=1,
+            override=True,
+        )
+        strategy_registry.register(
+            "deepspeed_stage_2",
+            cls,
+            description="DeepSpeed with ZeRO Stage 2 enabled",
+            stage=2,
+            override=True,
+        )
+        strategy_registry.register(
+            "deepspeed_stage_2_offload",
+            cls,
+            description="DeepSpeed ZeRO Stage 2 and CPU Offload",
+            stage=2,
+            offload_optimizer=True,
+            override=True,
+        )
+        strategy_registry.register(
+            "deepspeed_stage_3",
+            cls,
+            description="DeepSpeed ZeRO Stage 3",
+            stage=3,
+            override=True,
+        )
+        strategy_registry.register(
+            "deepspeed_stage_3_offload",
+            cls,
+            description="DeepSpeed ZeRO Stage 3 and CPU Offload",
+            stage=3,
+            offload_optimizer=True,
+            offload_parameters=True,
+            override=True,
+        )
+        strategy_registry.register(
+            "deepspeed_stage_3_offload_nvme",
+            cls,
+            description="DeepSpeed ZeRO Stage 3 and NVMe Offload",
+            stage=3,
+            offload_optimizer=True,
+            offload_parameters=True,
+            remote_device="nvme",
+            offload_params_device="nvme",
+            offload_optimizer_device="nvme",
+            override=True,
+        )
+
+
+def apply_fsdp_strategy_patch():
     FSDPStrategy.register_strategies(pl.strategies.StrategyRegistry)
+
+
+def apply_deepspeed_strategy_patch():
+    DeepSpeedStrategy.register_strategies(pl.strategies.StrategyRegistry)
+
+
+def apply_all_patches():
+    apply_fsdp_strategy_patch()
+    apply_deepspeed_strategy_patch()
diff --git a/lit_train.py b/lit_train.py
index cfead59..560a2e7 100644
--- a/lit_train.py
+++ b/lit_train.py
@@ -150,7 +150,7 @@ def parse_args():
     )
     parser.add_argument(
         "--seed",
-        type=str,
+        type=int,
         help="Random seed",
         default=42,
     )