gpt-pretrain/lit_module.py


from functools import cache
from typing import Dict, Optional
import pytorch_lightning as pl
import torch
import torchmetrics
from utils import init_model
from custom_models.gpt2.modeling_gpt2 import GPT2LMHeadModel
from transformers import AutoConfig


class LitModule(pl.LightningModule):
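    """Lightning wrapper around a causal GPT-style language model for pretraining."""
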
    def __init__(
        self,
        model_name: str,
        path: str = "",
        learning_rate: float = 0.0001,
        use_tril_attention_mask: bool = False,
    ):
        super().__init__()
        self.save_hyperparameters()
        if path != "":
            # Load pretrained weights from a local checkpoint directory.
            config = AutoConfig.for_model(model_type=model_name)
            model = GPT2LMHeadModel(config)
            model = model.from_pretrained(path)
            self.llm = self.register_core_module(model)
        else:
            # Otherwise initialize a fresh model of the requested type.
            self.llm = self.register_core_module(init_model(model_name))
        self.learning_rate = learning_rate
        self.use_tril_attention_mask = use_tril_attention_mask
        self.metric_loss = torchmetrics.MeanMetric()
        self.metric_accuracy = torchmetrics.Accuracy(
            task="multiclass",
            num_classes=self.llm.config.vocab_size,
        )

    @cache
    def get_batch_tril_matrix(self, block_size: int, batch_size: Optional[int] = None) -> torch.Tensor:
        # Lower-triangular (causal) attention mask, optionally repeated per batch item.
        matrix = torch.ones(block_size, block_size).tril()
        if batch_size is not None:
            matrix = matrix.repeat(batch_size, 1, 1)
        return matrix

    def register_core_module(self, module: torch.nn.Module) -> torch.nn.Module:
        # Stash a plain reference to the wrapped model; object.__setattr__ bypasses
        # nn.Module's attribute hooks.
        object.__setattr__(self, "__core_module__", module)
        return module

    def training_step(self, batch: Dict[str, torch.Tensor], batch_idx):
        batch_size, block_size = batch["input_ids"].shape
        if self.use_tril_attention_mask:
            # Pass an explicit lower-triangular (causal) attention mask.
            batch["attention_mask"] = self.get_batch_tril_matrix(block_size, batch_size=batch_size).to(self.device)
        outputs = self.llm(**batch, return_dict=True)
        loss = outputs.loss
        self.log("train_loss", loss, rank_zero_only=True)
        return loss

    def validation_step(self, batch: Dict[str, torch.Tensor], batch_idx):
        outputs = self.llm(**batch, return_dict=True)
        loss = outputs.loss
        # Shift so that the logits at position i are scored against the label at i + 1.
        logits = outputs.logits[..., :-1, :]
        labels = batch["labels"][..., 1:]
        self.metric_loss.update(loss)
        label_mask = labels != -100
        self.metric_accuracy.update(logits[label_mask], labels[label_mask])

    def on_validation_epoch_end(self) -> None:
        self.log("val_loss", self.metric_loss, rank_zero_only=True)
        self.log("accuracy", self.metric_accuracy, rank_zero_only=True)

    def configure_optimizers(self):
        strategy = self.trainer.strategy
        if isinstance(strategy, pl.strategies.DeepSpeedStrategy):
            assert "optimizer" not in strategy.config
            zero_config = strategy.config.get("zero_optimization")
            if zero_config is not None:
                if "offload_optimizer" in zero_config:
                    import deepspeed

                    # ZeRO optimizer offload requires DeepSpeed's CPU Adam implementation.
                    optimizer = deepspeed.ops.adam.DeepSpeedCPUAdam(
                        self.trainer.model.parameters(), lr=self.learning_rate
                    )
                    return optimizer
        optimizer = torch.optim.AdamW(self.trainer.model.parameters(), lr=self.learning_rate)
        return optimizer

    def configure_callbacks(self):
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            monitor="accuracy",
            mode="max",
            filename="{epoch:02d}-{accuracy:.4f}",
        )
        early_stop_callback = pl.callbacks.EarlyStopping(
            monitor="accuracy",
            min_delta=0.001,
            patience=3,
            mode="max",
            stopping_threshold=1,
        )
        return [checkpoint_callback, early_stop_callback]
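

# Minimal usage sketch (an assumption, not part of the original module): it presumes
# dataloaders that yield dicts with "input_ids", "attention_mask", and "labels"
# following the Hugging Face causal-LM interface; `train_loader` and `val_loader`
# are hypothetical names.
#
#   import pytorch_lightning as pl
#
#   lit = LitModule(model_name="gpt2", learning_rate=1e-4, use_tril_attention_mask=True)
#   trainer = pl.Trainer(max_epochs=1)
#   trainer.fit(lit, train_dataloaders=train_loader, val_dataloaders=val_loader)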