Update more.

2024-07-31 22:04:01 +08:00 · 2024-07-31 22:04:01 +08:00 · d50cb798b6
parent 50e502ae96
commit d50cb798b6
4 changed files with 113 additions and 17 deletions
--- a/unsuper/minist.py
+++ b/unsuper/minist.py
@ -0,0 +1,93 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F  # Add this line
 import torchvision
 import torchvision.transforms as transforms
 # Compute device: CUDA when it is available, otherwise the CPU.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Training hyper-parameters.
 num_epochs = 5
 batch_size = 4
 learning_rate = 0.001
 # The dataset yields PIL images; they are converted to tensors and then
 # normalized per channel with mean 0.5 and std 0.5 (target range [-1, 1]).
 transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,) * 3, (0.5,) * 3),
    ]
 )
 # CIFAR10: 60000 32x32 color images in 10 classes, 6000 images per class.
 # Both splits are stored under ./data and downloaded on first use.
 train_dataset = torchvision.datasets.CIFAR10(
    root="./data",
    train=True,
    download=True,
    transform=transform,
 )
 test_dataset = torchvision.datasets.CIFAR10(
    root="./data",
    train=False,
    download=True,
    transform=transform,
 )
 # Only the training loader shuffles; the test loader keeps dataset order.
 train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
 )
 test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
 )
 class ConvNet(nn.Module):
    """Small convolutional classifier producing 10 output scores.

    Two convolution + ReLU + max-pool stages are followed by three fully
    connected layers. The ReLU activations after ``fc1`` and ``fc2`` are
    not applied (they are commented out in the original code), so the
    three linear layers are chained directly.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # initialization and state_dict key order stay unchanged.
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on ``x`` and return the output of ``fc3``."""
        # Each convolution is followed by ReLU and the shared 2x2 max-pool.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Flatten to rows of 16 * 5 * 5 features for the linear layers.
        flat = x.view(-1, 16 * 5 * 5)
        # No nonlinearity between the fully connected layers.
        hidden = self.fc2(self.fc1(flat))
        return self.fc3(hidden)
 # Build the model on the selected device together with its loss and optimizer.
 model = ConvNet().to(device)
 criterion = nn.CrossEntropyLoss()
 optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
 # Train the model
 n_total_steps = len(train_loader)
 # Set training mode explicitly so the loop does not rely on the module's
 # default mode.
 model.train()
 for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Report the loss of the current batch every 2000 steps.
        if (i + 1) % 2000 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}")
 print("Finished Training")
 # Test the model
 # Switch to evaluation mode before measuring accuracy.
 model.eval()
 with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # max returns (value, index); the index along dim 1 is the predicted
        # class. The legacy `.data` accessor is not needed under no_grad().
        _, predicted = torch.max(outputs, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()
    # Accuracy as a percentage of all evaluated samples.
    acc = 100.0 * n_correct / n_samples
    print(f"Accuracy of the network on the 10000 test images: {acc} %")
--- a/wit/doc/meaning_dataset.md
+++ b/wit/doc/meaning_dataset.md
@ -10,12 +10,13 @@ meaning数据集是一个模仿自然语言，以及抽象表达的数据集。
 4. 从0到(vocab_size-1)的编号表示基本meaning，是不能被拆解的，也就是token
 5. meaning通过一层层的向低编号的meaning进行组合替换，最终形成一个最底层是token的树形数据
 6. level表示当前token相对于root meaning的距离
-7. rank_idx表示当前token在不同层的排序编号，每4位表示在一层里面的编号，低4位表示最低层级的rank_idx，高位无用的位用1填充
+7. rank
-7. rank_all表示当前token在不同层的分子个数，每4位表示在一层里面的编号，低4位表示最低层级的rank_all，高位无用的位用1填充
+8. rank_idx表示当前token在不同层的排序编号，每4位表示在一层里面的编号，低4位表示最低层级的rank_idx，高位无用的位用1填充
-8. tree用于存储每个meaning的拆解的数据，使用字典表达一个树形结构
+9. rank_all表示当前token在不同层的分子个数，每4位表示在一层里面的编号，低4位表示最低层级的rank_all，高位无用的位用1填充
-9. get_seq_mask返回一个sequence每个token在对应level是不是对应的index,level=0:最底层，index=-1:最后一个，index=0:第一个
+10. tree用于存储每个meaning的拆解的数据，使用字典表达一个树形结构
-10. meaning_height 当前meaning的总高度
+11. get_seq_mask返回一个sequence每个token在对应level是不是对应的index,level=0:最底层，index=-1:最后一个，index=0:第一个
-11. meaning_weight 当前meaning的总宽度
+12. meaning_height 当前meaning的总高度
 13. meaning_weight 当前meaning的总宽度
 ```
@ -31,10 +32,11 @@ vocab_size = 256 meaning = 115200
              / \     / \        / \       / \
            176 11  255 129    129  99   211 111
-sequence = 123 42 32 176 11 255 129 245 233 129 99 23 211 111 93 176
+sequence = 123  42  32 176  11 255 129 245 233 129  99  23 211 111  93 176
-level    =  3   3  2  4   4  4   4   2   2   4   4  3  4   4   3  3
+level    =  3    3   2  4    4  4   4   2   2   4    4   3  4   4    3  3
-idx at 0 =  0   1  1  0   1  0   1   0   1   0   1  2  0   1   0  1
+idx at 0 =  0    1   1  0    1  0   1   0   1   0    1   2  0   1    0  1
-idx at 1 =  0   0  0  0   0  1   1   1   1   0   0  0  0   0   2  2
+idx at 1 =  0    0   0  0    0  1   1   1   1   0    0   0  0   0    2  2
-idx         0   1  1  0   1 16  17  16  17   0   1  2  0   1  32 33
+idx         0    1   1  0    1 16  17  16  17   0    1   2  0   1   32 33
 ```
--- a/wit/meaning_dataset.py
+++ b/wit/meaning_dataset.py
@ -352,7 +352,6 @@ class MeaningDataset(Dataset):
        output["labels"] = data.clone()
        output["token_type_ids"] = torch.zeros(data.shape)
        output["tree"] = [self.tree[i] for i in idx_list]
        output["level"] = [self.level[i] for i in idx_list]
        output["mask"] = self.get_seq_mask_tensor(idx_list)
        return output
--- a/wit/train.py
+++ b/wit/train.py
@ -24,19 +24,19 @@ dataloader_works = 2
 vocab_size = 256
 level_ratio = 5
 level = 5
-dataset_level = 1.5
+dataset_level = 3
 min_subitem = 2
 hidden_size = 128  # 128 1024 2048  32
 num_attention_heads = 16  # 8 8 16
 num_hidden_layers = 6  # 6 12 24  3
-mask_level = [0, 1]
+mask_level = [0, 1, 2]
-mask_idx = [0, -1]
+mask_idx = [0, 0, -1]
 # name = "vocab_ratio_level_data_hidden_head_layer"
 # name = "mask_level_idx"
-name = "hard"
+name = "bigger"
 ver = f"{vocab_size}" + "_" + f"{level_ratio}" + "_" + f"{level}" + "_" + f"{min_subitem}" + "_" + f"{dataset_level}"
 ver = ver + "_" + f"{hidden_size}" + "_" + f"{num_attention_heads}" + "_" + f"{num_hidden_layers}"
@ -56,7 +56,9 @@ if __name__ == "__main__":
    start = vocab_size * (level_ratio**level)
    size = vocab_size * int((level_ratio**dataset_level))
-    raw_dataset = MeaningDataset(start, start + size, size, vocab_size, level_ratio, min_subitem)
+
    raw_dataset = MeaningDataset(start, start + size, vocab_size, None, level_ratio, min_subitem)
    # print(raw_dataset.token_frequency())
    raw_dataset.set_mask(mask_level, mask_idx)
    train_dataset, val_dataset = raw_dataset.split(0.9)
    train_dataloader = BatchGroupMeaningDataloader(train_dataset, train_batch_size).dataloader(dataloader_works)