diff --git a/unsuper/minist.py b/unsuper/minist.py
new file mode 100644
index 0000000..08d80ca
--- /dev/null
+++ b/unsuper/minist.py
@@ -0,0 +1,93 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F  # functional ops; F.relu is used in ConvNet.forward
+import torchvision
+import torchvision.transforms as transforms
+
+
+# Device configuration: run on the GPU when CUDA is available, otherwise on the CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Hyper-parameters
+num_epochs = 5  # full passes over the training set
+batch_size = 4  # samples per mini-batch for both loaders
+learning_rate = 0.001  # step size passed to the SGD optimizer
+
+# ToTensor converts each PIL image to a float tensor with values in [0, 1];
+# Normalize (mean 0.5, std 0.5 per channel) then rescales them to [-1, 1].
+transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+
+# CIFAR10: 60000 32x32 color images in 10 classes, with 6000 images per class
+train_dataset = torchvision.datasets.CIFAR10(root="./data", train=True, download=True, transform=transform)
+
+test_dataset = torchvision.datasets.CIFAR10(root="./data", train=False, download=True, transform=transform)
+
+train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+
+test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+
+
+class ConvNet(nn.Module):
+    def __init__(self):
+        super(ConvNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)  # 3 input channels -> 6 feature maps, 5x5 kernel
+        self.pool = nn.MaxPool2d(2, 2)  # 2x2 window, stride 2; reused after both conv layers
+        self.conv2 = nn.Conv2d(6, 16, 5)  # 6 -> 16 feature maps, 5x5 kernel
+        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 16 maps of 5x5 after conv2 + pool (for 32x32 inputs)
+        self.fc2 = nn.Linear(120, 84)
+        self.fc3 = nn.Linear(84, 10)  # 10 outputs, one score per class
+
+    def forward(self, x):
+        x = self.pool(F.relu(self.conv1(x)))  # conv -> ReLU -> max-pool
+        x = self.pool(F.relu(self.conv2(x)))
+        x = x.view(-1, 16 * 5 * 5)  # flatten to rows of 400 features
+        # NOTE(review): the ReLU after fc1 is disabled (was: x = F.relu(self.fc1(x)))
+        x = self.fc1(x)
+        # NOTE(review): ReLU after fc2 is disabled too, so fc1-fc3 compose into one affine map; confirm intent
+        x = self.fc2(x)
+        x = self.fc3(x)
+        return x
+
+
+model = ConvNet().to(device)
+
+criterion = nn.CrossEntropyLoss()
+optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+# Train the model
+n_total_steps = len(train_loader)  # number of mini-batches per epoch
+for epoch in range(num_epochs):
+    for i, (images, labels) in enumerate(train_loader):
+        images = images.to(device)
+        labels = labels.to(device)
+
+        # Forward pass: compute the class scores and the cross-entropy loss
+        outputs = model(images)
+        loss = criterion(outputs, labels)
+
+        # Backward and optimize: clear old gradients, backpropagate, update the weights
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+
+        if (i + 1) % 2000 == 0:  # report the current batch loss every 2000 steps
+            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}")
+
+print("Finished Training")
+
+# Test the model; no_grad disables gradient tracking during evaluation
+with torch.no_grad():
+    n_correct = 0
+    n_samples = 0
+    for images, labels in test_loader:
+        images = images.to(device)
+        labels = labels.to(device)
+        outputs = model(images)
+
+        # torch.max along dim 1 returns (values, indices); the index is the predicted class
+        _, predicted = torch.max(outputs.data, 1)
+        n_samples += labels.size(0)  # number of samples in this batch
+        n_correct += (predicted == labels).sum().item()
+
+    acc = 100.0 * n_correct / n_samples  # accuracy as a percentage
+    print(f"Accuracy of the network on the 10000 test images: {acc} %")
diff --git a/wit/doc/meaning_dataset.md b/wit/doc/meaning_dataset.md
index 617bb97..e1ae483 100644
--- a/wit/doc/meaning_dataset.md
+++ b/wit/doc/meaning_dataset.md
@@ -10,12 +10,13 @@ meaning数据集是一个模仿自然语言，以及抽象表达的数据集。
 4. 从0到(vocab_size-1)的编号表示基本meaning，是不能被拆解的，也就是token
 5. meaning通过一层层的向低编号的meaning进行组合替换，最终形成一个最底层是token的树形数据
 6. level表示当前token相对于root meaning的距离
-7. rank_idx表示当前token在不同层的排序编号，每4位表示在一层里面的编号，低4位表示最低层级的rank_idx，高位无用的位用1填充
-7. rank_all表示当前token在不同层的分子个数，每4位表示在一层里面的编号，低4位表示最低层级的rank_all，高位无用的位用1填充
-8. tree用于存储每个meaning的拆解的数据，使用字典表达一个树形结构
-9. get_seq_mask返回一个sequence每个token在对应level是不是对应的index,level=0:最底层，index=-1:最后一个，index=0:第一个
-10. meaning_height 当前meaning的总高度
-11. meaning_weight 当前meaning的总宽度
+7. rank
+8. rank_idx表示当前token在不同层的排序编号，每4位表示在一层里面的编号，低4位表示最低层级的rank_idx，高位无用的位用1填充
+9. rank_all表示当前token在不同层的分子个数，每4位表示在一层里面的编号，低4位表示最低层级的rank_all，高位无用的位用1填充
+10. tree用于存储每个meaning的拆解的数据，使用字典表达一个树形结构
+11. get_seq_mask返回一个sequence每个token在对应level是不是对应的index,level=0:最底层，index=-1:最后一个，index=0:第一个
+12. meaning_height 当前meaning的总高度
+13. meaning_weight 当前meaning的总宽度
 
 
 ```
@@ -31,10 +32,11 @@ vocab_size = 256 meaning = 115200
               / \     / \        / \       / \
             176 11  255 129    129  99   211 111
 
-sequence = 123 42 32 176 11 255 129 245 233 129 99 23 211 111 93 176
-level    =  3   3  2  4   4  4   4   2   2   4   4  3  4   4   3  3
-idx at 0 =  0   1  1  0   1  0   1   0   1   0   1  2  0   1   0  1
-idx at 1 =  0   0  0  0   0  1   1   1   1   0   0  0  0   0   2  2
-idx         0   1  1  0   1 16  17  16  17   0   1  2  0   1  32 33
+sequence = 123  42  32 176  11 255 129 245 233 129  99  23 211 111  93 176
+level    =   3   3   2   4   4   4   4   2   2   4   4   3   4   4   3   3
+idx at 0 =   0   1   1   0   1   0   1   0   1   0   1   2   0   1   0   1
+idx at 1 =   0   0   0   0   0   1   1   1   1   0   0   0   0   0   2   2
+idx          0   1   1   0   1  16  17  16  17   0   1   2   0   1  32  33
+
 
 ```
diff --git a/wit/meaning_dataset.py b/wit/meaning_dataset.py
index a702250..415febb 100644
--- a/wit/meaning_dataset.py
+++ b/wit/meaning_dataset.py
@@ -352,7 +352,6 @@ class MeaningDataset(Dataset):
         output["labels"] = data.clone()
         output["token_type_ids"] = torch.zeros(data.shape)
         output["tree"] = [self.tree[i] for i in idx_list]
-        output["level"] = [self.level[i] for i in idx_list]
         output["mask"] = self.get_seq_mask_tensor(idx_list)
         return output
 
diff --git a/wit/train.py b/wit/train.py
index 0202ad5..3e9716a 100644
--- a/wit/train.py
+++ b/wit/train.py
@@ -24,19 +24,19 @@ dataloader_works = 2
 vocab_size = 256
 level_ratio = 5
 level = 5
-dataset_level = 1.5
+dataset_level = 3
 min_subitem = 2
 
 hidden_size = 128  # 128 1024 2048  32
 num_attention_heads = 16  # 8 8 16
 num_hidden_layers = 6  # 6 12 24  3
 
-mask_level = [0, 1]
-mask_idx = [0, -1]
+mask_level = [0, 1, 2]
+mask_idx = [0, 0, -1]
 
 # name = "vocab_ratio_level_data_hidden_head_layer"
 # name = "mask_level_idx"
-name = "hard"
+name = "bigger"
 
 ver = f"{vocab_size}" + "_" + f"{level_ratio}" + "_" + f"{level}" + "_" + f"{min_subitem}" + "_" + f"{dataset_level}"
 ver = ver + "_" + f"{hidden_size}" + "_" + f"{num_attention_heads}" + "_" + f"{num_hidden_layers}"
@@ -56,7 +56,9 @@ if __name__ == "__main__":
 
     start = vocab_size * (level_ratio**level)
     size = vocab_size * int((level_ratio**dataset_level))
-    raw_dataset = MeaningDataset(start, start + size, size, vocab_size, level_ratio, min_subitem)
+
+    raw_dataset = MeaningDataset(start, start + size, vocab_size, None, level_ratio, min_subitem)
+    # print(raw_dataset.token_frequency())
     raw_dataset.set_mask(mask_level, mask_idx)
     train_dataset, val_dataset = raw_dataset.split(0.9)
     train_dataloader = BatchGroupMeaningDataloader(train_dataset, train_batch_size).dataloader(dataloader_works)