Witllm/binary/cudagraph.py

import torch

# 1. Define the model (it must use static shapes and static control flow).
class SimpleModel(torch.nn.Module):
    """Single-layer classifier: flatten the input, apply a linear layer, ReLU.

    The forward pass has no data-dependent branching, and the only shape it
    derives at runtime is the batch dimension of its input.

    Args:
        in_features: Number of elements per sample after flattening.
            Defaults to ``3 * 224 * 224``.
        num_classes: Number of output features of the linear layer.
            Defaults to ``1000``.
    """

    def __init__(self, in_features=3 * 224 * 224, num_classes=1000):
        super().__init__()
        self.fc = torch.nn.Linear(in_features, num_classes)

    def forward(self, x):
        """Return ``relu(fc(x))`` with ``x`` flattened to ``(batch, -1)``.

        Args:
            x: Tensor whose first dimension is the batch and whose remaining
                dimensions multiply out to the layer's ``in_features``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with non-negative values.
        """
        # reshape() returns a view when the memory layout allows it and copies
        # otherwise, so non-contiguous inputs (e.g. channels_last) are accepted
        # where view() would raise.
        x = x.reshape(x.size(0), -1)
        # A plain ReLU keeps the forward pass free of dynamic control flow.
        return torch.relu(self.fc(x))

# Instantiate the model and move its parameters to the GPU.
model = SimpleModel().cuda()

# 2. Prepare the static input/output placeholder tensors.
#    `static_input` is the tensor the graph is captured with below; new data
#    has to be copied into it rather than rebinding the name.
static_input = torch.randn(16, 3, 224, 224, device='cuda')
# NOTE(review): this zeros tensor is never written to; `static_output` is
# rebound to the model's return value during warm-up and capture below.
static_output = torch.zeros(16, 1000, device='cuda')

# 3. Warm-up phase (must run on a non-default stream).
s = torch.cuda.Stream()
# Make the side stream wait for work already queued on the current stream.
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):  # three warm-up iterations
        static_output = model(static_input)
# Make the current stream wait for the warm-up work before capture starts.
torch.cuda.current_stream().wait_stream(s)

# 4. Create and capture the CUDA graph.
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    # Note: the operations here are not actually executed; they are only
    # recorded into the graph.
    static_output = model(static_input)
# NOTE(review): grad mode is left at its default during capture; if this graph
# is inference-only, confirm whether it should be wrapped in torch.no_grad().

# 5. Run the captured graph (refresh the input data, then replay).
def run_graph(new_input):
    """Replay the captured CUDA graph on ``new_input`` and return the result.

    Args:
        new_input: Tensor with exactly the same shape as ``static_input``.

    Returns:
        A clone of ``static_output`` taken after the replay, so the returned
        tensor is independent of the graph's output tensor.

    Raises:
        ValueError: If ``new_input`` does not have the shape of
            ``static_input``.
    """
    # copy_() broadcasts its source, so a mismatched but broadcastable input
    # (for example a batch of 1) would otherwise be silently replicated across
    # the whole captured batch instead of being rejected.
    if new_input.shape != static_input.shape:
        raise ValueError(
            f"run_graph expects input of shape {tuple(static_input.shape)}, "
            f"got {tuple(new_input.shape)}"
        )
    # Copy the new data into the tensor the graph was captured with.
    static_input.copy_(new_input)
    # Replay the recorded graph.
    g.replay()
    return static_output.clone()  # return a copy of the result

# Smoke test: push a fresh random batch through the captured graph.
new_data = torch.randn((16, 3, 224, 224), device="cuda")
result = run_graph(new_data)
print(result.size())  # torch.Size([16, 1000])
Refine MNIST LUT by adding lutCNN. 2025-05-27 18:51:07 +08:00			`import torch`

			`# 1. 定义模型（需满足静态形状和静态控制流）`
			`class SimpleModel(torch.nn.Module):`
			`def __init__(self):`
			`super().__init__()`
			`self.fc = torch.nn.Linear(3 * 224 * 224, 1000)`

			`def forward(self, x):`
			`x = x.view(x.size(0), -1) # 静态形状操作`
			`return torch.relu(self.fc(x)) # 避免动态控制流`

			`model = SimpleModel().cuda()`

			`# 2. 准备静态输入/输出占位张量`
			`static_input = torch.randn(16, 3, 224, 224, device='cuda')`
			`static_output = torch.zeros(16, 1000, device='cuda')`

			`# 3. 预热阶段（必须在非默认流）`
			`s = torch.cuda.Stream()`
			`s.wait_stream(torch.cuda.current_stream())`
			`with torch.cuda.stream(s):`
			`for _ in range(3): # 预热3次`
			`static_output = model(static_input)`
			`torch.cuda.current_stream().wait_stream(s)`

			`# 4. 创建并捕获CUDA图`
			`g = torch.cuda.CUDAGraph()`
			`with torch.cuda.graph(g):`
			`# 注意：此处操作不会实际执行，仅记录计算图`
			`static_output = model(static_input)`

			`# 5. 使用图计算（更新数据+重放）`
			`def run_graph(new_input):`
			`# 将新数据复制到捕获的输入张量`
			`static_input.copy_(new_input)`
			`# 重放计算图`
			`g.replay()`
			`return static_output.clone() # 返回结果副本`

			`# 测试`
			`new_data = torch.randn(16, 3, 224, 224, device='cuda')`
			`result = run_graph(new_data)`
			`print(result.shape) # torch.Size([16, 1000])`