diff --git a/unsuper/minist.py b/unsuper/minist.py
new file mode 100644
index 0000000..08d80ca
--- /dev/null
+++ b/unsuper/minist.py
@@ -0,0 +1,112 @@
+"""Train and evaluate a small CNN classifier on CIFAR-10.
+
+Importing or running this module downloads CIFAR-10 into ./data when it is
+missing, trains ConvNet with SGD for num_epochs epochs, and prints the
+accuracy on the test split.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision
+import torchvision.transforms as transforms
+
+
+# Device configuration
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Hyper-parameters
+num_epochs = 5
+batch_size = 4
+learning_rate = 0.001
+
+# ToTensor() yields float tensors in [0, 1]; normalizing each channel with
+# mean 0.5 and std 0.5 maps them to [-1, 1].
+transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+# CIFAR10: 60000 32x32 color images in 10 classes, with 6000 images per class
+train_dataset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
+
+test_dataset = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)
+
+train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+
+test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+
+
+class ConvNet(nn.Module):
+    """Two conv/pool stages followed by three fully connected layers.
+
+    The layer sizes assume 3-channel 32x32 inputs: each 5x5 convolution
+    trims 4 pixels and each pool halves the size (32 -> 28 -> 14 -> 10 -> 5),
+    which yields the 16 * 5 * 5 features fed to fc1.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5)
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)
+
+    def forward(self, x):
+        """Return the 10 class logits for a batch of images."""
+        x = self.pool(F.relu(self.conv1(x)))
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)
+        # NOTE(review): no activation is applied after fc1 and fc2, so the
+        # three linear layers compose to a single affine map. Confirm this is
+        # intentional before wrapping fc1/fc2 in F.relu.
+        x = self.fc1(x)
+        x = self.fc2(x)
+        x = self.fc3(x)
+        return x
+
+
+model = ConvNet().to(device)
+
+criterion = nn.CrossEntropyLoss()
+optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+# Train the model
+n_total_steps = len(train_loader)
+for epoch in range(num_epochs):
+    for i, (images, labels) in enumerate(train_loader):
+        images = images.to(device)
+        labels = labels.to(device)
+
+        # Forward pass
+        outputs = model(images)
+        loss = criterion(outputs, labels)
+
+        # Backward and optimize
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        # Report the loss of the current batch every 2000 steps.
+        if (i + 1) % 2000 == 0:
+            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}")
+
+print("Finished Training")
+
+# Test the model
+with torch.no_grad():
+    n_correct = 0
+    n_samples = 0
+    for images, labels in test_loader:
+        images = images.to(device)
+        labels = labels.to(device)
+        outputs = model(images)
+
+        # torch.max over dim 1 returns (values, indices); the index of the
+        # largest logit is the predicted class. Autograd is already disabled
+        # by no_grad(), so the legacy .data access is unnecessary.
+        _, predicted = torch.max(outputs, 1)
+        n_samples += labels.size(0)
+        n_correct += (predicted == labels).sum().item()
+
+    acc = 100.0 * n_correct / n_samples
+    print(f"Accuracy of the network on the 10000 test images: {acc} %")
diff --git a/wit/doc/meaning_dataset.md b/wit/doc/meaning_dataset.md
index 617bb97..e1ae483 100644
--- a/wit/doc/meaning_dataset.md
+++ b/wit/doc/meaning_dataset.md
@@ -10,12 +10,13 @@ meaning数据集是一个模仿自然语言,以及抽象表达的数据集。
 4. 从0到(vocab_size-1)的编号表示基本meaning,是不能被拆解的,也就是token
 5. meaning通过一层层的向低编号的meaning进行组合替换,最终形成一个最底层是token的树形数据
 6. level表示当前token相对于root meaning的距离
-7. rank_idx表示当前token在不同层的排序编号,每4位表示在一层里面的编号,低4位表示最低层级的rank_idx,高位无用的位用1填充
-7. rank_all表示当前token在不同层的分子个数,每4位表示在一层里面的编号,低4位表示最低层级的rank_all,高位无用的位用1填充
-8. tree用于存储每个meaning的拆解的数据,使用字典表达一个树形结构
-9. get_seq_mask返回一个sequence每个token在对应level是不是对应的index,level=0:最底层,index=-1:最后一个,index=0:第一个
-10. meaning_height 当前meaning的总高度
-11. meaning_weight 当前meaning的总宽度
+7. rank
+8. rank_idx表示当前token在不同层的排序编号,每4位表示在一层里面的编号,低4位表示最低层级的rank_idx,高位无用的位用1填充
+9. rank_all表示当前token在不同层的分子个数,每4位表示在一层里面的编号,低4位表示最低层级的rank_all,高位无用的位用1填充
+10. tree用于存储每个meaning的拆解的数据,使用字典表达一个树形结构
+11. get_seq_mask返回一个sequence每个token在对应level是不是对应的index,level=0:最底层,index=-1:最后一个,index=0:第一个
+12. meaning_height 当前meaning的总高度
+13.
meaning_weight 当前meaning的总宽度 ``` @@ -31,10 +32,11 @@ vocab_size = 256 meaning = 115200 / \ / \ / \ / \ 176 11 255 129 129 99 211 111 -sequence = 123 42 32 176 11 255 129 245 233 129 99 23 211 111 93 176 -level = 3 3 2 4 4 4 4 2 2 4 4 3 4 4 3 3 -idx at 0 = 0 1 1 0 1 0 1 0 1 0 1 2 0 1 0 1 -idx at 1 = 0 0 0 0 0 1 1 1 1 0 0 0 0 0 2 2 -idx 0 1 1 0 1 16 17 16 17 0 1 2 0 1 32 33 +sequence = 123 42 32 176 11 255 129 245 233 129 99 23 211 111 93 176 +level = 3 3 2 4 4 4 4 2 2 4 4 3 4 4 3 3 +idx at 0 = 0 1 1 0 1 0 1 0 1 0 1 2 0 1 0 1 +idx at 1 = 0 0 0 0 0 1 1 1 1 0 0 0 0 0 2 2 +idx 0 1 1 0 1 16 17 16 17 0 1 2 0 1 32 33 + ``` diff --git a/wit/meaning_dataset.py b/wit/meaning_dataset.py index a702250..415febb 100644 --- a/wit/meaning_dataset.py +++ b/wit/meaning_dataset.py @@ -352,7 +352,6 @@ class MeaningDataset(Dataset): output["labels"] = data.clone() output["token_type_ids"] = torch.zeros(data.shape) output["tree"] = [self.tree[i] for i in idx_list] - output["level"] = [self.level[i] for i in idx_list] output["mask"] = self.get_seq_mask_tensor(idx_list) return output diff --git a/wit/train.py b/wit/train.py index 0202ad5..3e9716a 100644 --- a/wit/train.py +++ b/wit/train.py @@ -24,19 +24,19 @@ dataloader_works = 2 vocab_size = 256 level_ratio = 5 level = 5 -dataset_level = 1.5 +dataset_level = 3 min_subitem = 2 hidden_size = 128 # 128 1024 2048 32 num_attention_heads = 16 # 8 8 16 num_hidden_layers = 6 # 6 12 24 3 -mask_level = [0, 1] -mask_idx = [0, -1] +mask_level = [0, 1, 2] +mask_idx = [0, 0, -1] # name = "vocab_ratio_level_data_hidden_head_layer" # name = "mask_level_idx" -name = "hard" +name = "bigger" ver = f"{vocab_size}" + "_" + f"{level_ratio}" + "_" + f"{level}" + "_" + f"{min_subitem}" + "_" + f"{dataset_level}" ver = ver + "_" + f"{hidden_size}" + "_" + f"{num_attention_heads}" + "_" + f"{num_hidden_layers}" @@ -56,7 +56,9 @@ if __name__ == "__main__": start = vocab_size * (level_ratio**level) size = vocab_size * int((level_ratio**dataset_level)) - raw_dataset 
= MeaningDataset(start, start + size, size, vocab_size, level_ratio, min_subitem) + + raw_dataset = MeaningDataset(start, start + size, vocab_size, None, level_ratio, min_subitem) + # print(raw_dataset.token_frequency()) raw_dataset.set_mask(mask_level, mask_idx) train_dataset, val_dataset = raw_dataset.split(0.9) train_dataloader = BatchGroupMeaningDataloader(train_dataset, train_batch_size).dataloader(dataloader_works)