Skip to main content

Coding & Software

KernelBench

LLMs generate optimized CUDA kernels for 250 PyTorch ML workloads; response is per-task functional correctness.

250 items
7 subjects
99% observed
MIT license
ml_engineering domain
software_engineering domain
text modality

Response matrix

Fit to width. Hover for subject & item; click a cell for details.

This condition combination wasn’t evaluated — try a different attack, category, or judge.
Correct (1) · Incorrect (0) · Unobserved

Scale: 1 = correct · 0 = incorrect

Sample items

Item 1 · 0% solve rate

import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange

class Model(nn.Module):
    def __init__(self, batch_size, seq_length, n_heads, d_head, d_state, block_len=64):
        """
        Mamba Structured State Space model implementation for benchmarking.

        :param batch_size: Size of the batch
        :param seq_length: Length of the input sequence
        :param n_heads: Number of attention heads
        :param d_head: Dimension of each head
        :param d_state: Dimension of the state space
        :param block_len: Length of each block for chunked computation
        """
        super(Model, self).__init__()

        assert seq_length % block_len == 0, "Sequence length must be divisible by block length"

        self.batch_size = batch_size
        self.seq_length = seq_length
        self.n_heads = n_heads
        self.d_head = d_head
        self.d_state = d_state
        self.block_len = block_len

        # Initialize parameters
        self.A = nn.Parameter(torch.randn(batch_size, seq_length, n_heads))
        self.B = nn.Parameter(torch.randn(batch_size, seq_length, n_heads, d_state))
        self.C = nn.Parameter(torch.randn(batch_size, seq_length, n_heads, d_state))

    def segsum(self, x):
        """
        Naive segment sum calculation.

        Along the last axis, entry [..., i, j] of the result is
        cumsum(x)[i] - cumsum(x)[j], i.e. the sum of x[j+1 .. i], for j <= i.
        Entries above the diagonal (j > i) are filled with -inf so that a
        following exp() maps them to 0.

        :param x: Tensor whose last axis (length T) is segment-summed
        :return: Tensor of shape x.shape + (T,)
        """
        T = x.size(-1)
        x_cumsum = torch.cumsum(x, dim=-1)
        x_segsum = x_cumsum[..., :, None] - x_cumsum[..., None, :]
        mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool), diagonal=0)
        x_segsum = x_segsum.masked_fill(~mask, -torch.inf)
        return x_segsum

    def forward(self, X, initial_states=None):
        """
        Forward pass implementing the SSD operation.

        :param X: Input tensor of shape (batch, length, n_heads, d_head)
        :param initial_states: Optional initial states; when omitted, zeros
            shaped like a single chunk of the intra-chunk states are used
        :return: Output tensor Y of shape (batch, length, n_heads, d_head).
            Only Y is returned; the final state is not.
        """
        # Rearrange into blocks/chunks
        X_blocks, A_blocks, B_blocks, C_blocks = [
            rearrange(x, "b (c l) ... -> b c l ...", l=self.block_len)
            for x in (X, self.A, self.B, self.C)
        ]

        A_blocks = rearrange(A_blocks, "b c l h -> b h c l")
        A_cumsum = torch.cumsum(A_blocks, dim=-1)

        # 1. Compute diagonal block outputs
        L = torch.exp(self.segsum(A_blocks))
        Y_diag = torch.einsum("bclhn,bcshn,bhcls,bcshp->bclhp",
                              C_blocks, B_blocks, L, X_blocks)

        # 2. Compute intra-chunk states
        decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum))
        states = torch.einsum("bclhn,bhcl,bclhp->bchpn",
                              B_blocks, decay_states, X_blocks)

        # 3. Compute inter-chunk recurrence
        if initial_states is None:
            initial_states = torch.zeros_like(states[:, :1])
        states = torch.cat([initial_states, states], dim=1)

        # The left pad of one zero lines the chunk decays up with the
        # prepended initial state.
        decay_chunk = torch.exp(self.segsum(F.pad(A_cumsum[:, :, :, -1], (1, 0))))
        new_states = torch.einsum("bhzc,bchpn->bzhpn", decay_chunk, states)
        # Drop the last entry: only the state entering each chunk is used below.
        states = new_states[:, :-1]

        # 4. Compute state-to-output conversion
        state_decay_out = torch.exp(A_cumsum)
        Y_off = torch.einsum('bclhn,bchpn,bhcl->bclhp',
                             C_blocks, states, state_decay_out)

        # Combine diagonal and off-diagonal terms
        Y = rearrange(Y_diag + Y_off, "b c l h p -> b (c l) h p")

        return Y

# Test parameters
batch_size = 2048
seq_length = 128
n_heads = 8
d_head = 64
d_state = 16
block_len = 64


def get_inputs():
    """Return a one-element list holding a uniform-random tensor of shape
    (batch_size, seq_length, n_heads, d_head)."""
    return [torch.rand(batch_size, seq_length, n_heads, d_head)]


def get_init_inputs():
    """Return the constructor arguments, in the order the Model constructor declares them."""
    return [batch_size, seq_length, n_heads, d_head, d_state, block_len]

Subject outcomes

  • Claude 3.5 Sonnet incorrect
  • DeepSeek-V3 incorrect
  • DeepSeek-R1 incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
  • OpenAI o1 incorrect
Item 2 · 1% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    Model that performs a GEMM, Group Normalization, Minimum operation, and Bias addition.
    """

    def __init__(self, in_features, out_features, num_groups, bias_shape):
        """
        :param in_features: Input width of the linear layer
        :param out_features: Output width of the linear layer (and GroupNorm channel count)
        :param num_groups: Number of groups for GroupNorm
        :param bias_shape: Shape of the randomly initialised bias parameter
        """
        super(Model, self).__init__()
        self.gemm = nn.Linear(in_features, out_features)
        self.group_norm = nn.GroupNorm(num_groups, out_features)
        self.bias = nn.Parameter(torch.randn(bias_shape))

    def forward(self, x):
        """
        Apply linear -> group norm -> min over dim 1 -> bias add.

        After the keepdim min, x has a single column; the bias is then added
        with ordinary broadcasting, so the result takes the broadcast of that
        shape with ``bias_shape`` (which may have more dimensions than x).
        """
        x = self.gemm(x)
        x = self.group_norm(x)
        x = torch.min(x, dim=1, keepdim=True)[0]
        x = x + self.bias
        return x

batch_size = 1024
in_features = 8192
out_features = 8192
num_groups = 512
bias_shape = (1, out_features, 1, 1)


def get_inputs():
    """Return a one-element list holding a uniform-random (batch_size, in_features) tensor."""
    return [torch.rand(batch_size, in_features)]


def get_init_inputs():
    """Return the constructor arguments, in the order the Model constructor declares them."""
    return [in_features, out_features, num_groups, bias_shape]

Subject outcomes

  • Claude 3.5 Sonnet incorrect
  • DeepSeek-V3 incorrect
  • DeepSeek-R1 incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
  • OpenAI o1 incorrect
Item 3 · 4% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    Model that performs a transposed convolution, applies GELU, and normalizes with GroupNorm.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, groups, num_groups):
        """
        :param in_channels: Input channels of the transposed convolution
        :param out_channels: Output channels (also the GroupNorm channel count)
        :param kernel_size: Transposed-convolution kernel size
        :param stride: Transposed-convolution stride
        :param groups: Accepted for interface compatibility; not used by this model
        :param num_groups: Number of groups for GroupNorm
        """
        super(Model, self).__init__()
        self.conv_transpose = nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride=stride)
        self.group_norm = nn.GroupNorm(num_groups=num_groups, num_channels=out_channels)

    def forward(self, x):
        """Apply transposed convolution, then GELU, then group normalization."""
        x = self.conv_transpose(x)
        x = torch.nn.functional.gelu(x)
        x = self.group_norm(x)
        return x

batch_size = 128
in_channels = 64
out_channels = 64
height = width = 256
kernel_size = 3
stride = 1
groups = 8
num_groups = 8


def get_inputs():
    """Return a one-element list holding a uniform-random
    (batch_size, in_channels, height, width) tensor."""
    return [torch.rand(batch_size, in_channels, height, width)]


def get_init_inputs():
    """Return the constructor arguments, in the order the Model constructor declares them."""
    return [in_channels, out_channels, kernel_size, stride, groups, num_groups]

Subject outcomes

  • DeepSeek-R1 correct
  • Claude 3.5 Sonnet incorrect
  • DeepSeek-V3 incorrect
  • GPT-4o incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
Item 4 · 8% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs a convolution, divides by a constant, and applies LeakyReLU.
    """

    def __init__(self, in_channels, out_channels, kernel_size, divisor):
        """
        :param in_channels: Input channels of the convolution
        :param out_channels: Output channels of the convolution
        :param kernel_size: Convolution kernel size
        :param divisor: Constant the convolution output is divided by
        """
        super(Model, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
        self.divisor = divisor

    def forward(self, x):
        """Apply convolution, divide by ``self.divisor``, then LeakyReLU (slope 0.01)."""
        x = self.conv(x)
        x = x / self.divisor
        x = torch.nn.functional.leaky_relu(x, negative_slope=0.01)
        return x

batch_size = 128
in_channels = 8
out_channels = 64
height, width = 128, 128
kernel_size = 3
divisor = 2


def get_inputs():
    """Return a one-element list holding a uniform-random
    (batch_size, in_channels, height, width) tensor."""
    return [torch.rand(batch_size, in_channels, height, width)]


def get_init_inputs():
    """Return the constructor arguments, in the order the Model constructor declares them."""
    return [in_channels, out_channels, kernel_size, divisor]

Subject outcomes

  • DeepSeek-R1 correct
  • OpenAI o1 correct
  • Claude 3.5 Sonnet incorrect
  • GPT-4o incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
Item 5 · 9% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    A 3D convolutional transpose layer followed by Batch Normalization and subtraction.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, bias=True):
        """
        :param in_channels: Input channels of the transposed convolution
        :param out_channels: Output channels (also the BatchNorm feature count)
        :param kernel_size: Transposed-convolution kernel size
        :param stride: Transposed-convolution stride
        :param padding: Transposed-convolution padding
        :param bias: Whether the transposed convolution has a bias term
        """
        super(Model, self).__init__()
        self.conv_transpose = nn.ConvTranspose3d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=bias)
        self.batch_norm = nn.BatchNorm3d(out_channels)

    def forward(self, x):
        """Apply transposed convolution and batch norm, then remove the
        per-sample, per-channel spatial mean."""
        x = self.conv_transpose(x)
        x = self.batch_norm(x)
        x = x - torch.mean(x, dim=(2, 3, 4), keepdim=True)  # Subtract mean along spatial dimensions
        return x

batch_size = 16
in_channels = 16
out_channels = 32
depth, height, width = 16, 32, 32
kernel_size = 3
stride = 2
padding = 1


def get_inputs():
    """Return a one-element list holding a uniform-random
    (batch_size, in_channels, depth, height, width) tensor."""
    return [torch.rand(batch_size, in_channels, depth, height, width)]


def get_init_inputs():
    """Return the constructor arguments, in the order the Model constructor
    declares them (``bias`` is left at its default)."""
    return [in_channels, out_channels, kernel_size, stride, padding]

Subject outcomes

  • Claude 3.5 Sonnet incorrect
  • DeepSeek-V3 incorrect
  • DeepSeek-R1 incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
  • OpenAI o1 incorrect
Item 6 · 11% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    Model that performs a convolution, scales the output, and then applies a minimum operation.
    """

    def __init__(self, in_channels, out_channels, kernel_size, scale_factor):
        """
        :param in_channels: Input channels of the convolution
        :param out_channels: Output channels of the convolution
        :param kernel_size: Convolution kernel size
        :param scale_factor: Constant the convolution output is multiplied by
        """
        super(Model, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)
        self.scale_factor = scale_factor

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).
        Returns:
            torch.Tensor: Output tensor of shape (batch_size, 1, height', width'),
            where the channel axis has been reduced to size 1 by the keepdim
            minimum and height'/width' are the spatial sizes produced by the
            convolution.
        """
        x = self.conv(x)
        x = x * self.scale_factor
        x = torch.min(x, dim=1, keepdim=True)[0]  # Minimum along channel dimension
        return x

batch_size = 64
in_channels = 64
out_channels = 128
height = width = 256
kernel_size = 3
scale_factor = 2.0


def get_inputs():
    """Return a one-element list holding a uniform-random
    (batch_size, in_channels, height, width) tensor."""
    return [torch.rand(batch_size, in_channels, height, width)]


def get_init_inputs():
    """Return the constructor arguments, in the order the Model constructor declares them."""
    return [in_channels, out_channels, kernel_size, scale_factor]

Subject outcomes

  • DeepSeek-R1 correct
  • OpenAI o1 correct
  • Claude 3.5 Sonnet incorrect
  • GPT-4o incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
Item 7 · 13% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    Model that performs a 3D convolution, max pooling, log sum exp, and ReLU activation.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        """
        :param in_channels: Input channels of the convolution
        :param out_channels: Output channels of the convolution
        :param kernel_size: Convolution kernel size
        :param stride: Convolution stride
        :param padding: Convolution padding
        """
        super(Model, self).__init__()
        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=padding)
        # The pooling window and stride are fixed at 2, independent of the constructor arguments.
        self.max_pool = nn.MaxPool3d(kernel_size=2, stride=2)

    def forward(self, x):
        """
        Args:
            x: Input tensor of shape (batch_size, in_channels, depth, height, width)
        Returns:
            Output tensor of shape (batch_size, 1, depth', height', width');
            the channel axis is reduced to size 1 by the keepdim logsumexp.
        """
        x = self.conv(x)
        x = self.max_pool(x)
        x = torch.logsumexp(x, dim=1, keepdim=True)
        x = torch.relu(x)
        return x

batch_size = 4
in_channels = 32
out_channels = 64
depth, height, width = 32, 128, 128
kernel_size = 3
stride = 1
padding = 1


def get_inputs():
    """Return a one-element list holding a uniform-random
    (batch_size, in_channels, depth, height, width) tensor."""
    return [torch.rand(batch_size, in_channels, depth, height, width)]


def get_init_inputs():
    """Return the constructor arguments, in the order the Model constructor declares them."""
    return [in_channels, out_channels, kernel_size, stride, padding]

Subject outcomes

  • DeepSeek-R1 correct
  • OpenAI o1 correct
  • Claude 3.5 Sonnet incorrect
  • GPT-4o incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
Item 8 · 15% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs a single matrix multiplication (C = A * B)
    on the transposes of its two inputs.
    """

    def __init__(self):
        super(Model, self).__init__()

    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
        """
        Performs matrix multiplication of the transposed inputs.

        Args:
            A: Input tensor of shape (K, M); it is transposed before the product.
            B: Input tensor of shape (N, K); it is transposed before the product.

        Returns:
            Output tensor of shape (M, N), equal to A.T @ B.T.
        """
        return torch.matmul(A.T, B.T)

M = 1024 * 2
K = 4096 * 2
N = 2048 * 2


def get_inputs():
    """Return [A, B]: uniform-random tensors of shape (K, M) and (N, K)."""
    A = torch.rand(K, M)
    B = torch.rand(N, K)
    return [A, B]


def get_init_inputs():
    """Return the constructor arguments (none)."""
    return []  # No special initialization inputs needed

Subject outcomes

  • DeepSeek-R1 correct
  • OpenAI o1 correct
  • GPT-4o correct
  • Claude 3.5 Sonnet incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
Item 9 · 19% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    Model that performs a convolution, applies minimum operation, Tanh, and another Tanh.
    """

    def __init__(self, in_channels, out_channels, kernel_size):
        """
        :param in_channels: Input channels of the convolution
        :param out_channels: Output channels of the convolution
        :param kernel_size: Convolution kernel size
        """
        super(Model, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size)

    def forward(self, x):
        """Apply convolution, a keepdim minimum over the channel axis, then tanh twice."""
        x = self.conv(x)
        x = torch.min(x, dim=1, keepdim=True)[0]  # Apply minimum operation along the channel dimension
        x = torch.tanh(x)
        x = torch.tanh(x)
        return x

batch_size = 128
in_channels = 16
out_channels = 64
height = width = 256
kernel_size = 3


def get_inputs():
    """Return a one-element list holding a uniform-random
    (batch_size, in_channels, height, width) tensor."""
    return [torch.rand(batch_size, in_channels, height, width)]


def get_init_inputs():
    """Return the constructor arguments, in the order the Model constructor declares them."""
    return [in_channels, out_channels, kernel_size]

Subject outcomes

  • DeepSeek-R1 correct
  • OpenAI o1 correct
  • Claude 3.5 Sonnet incorrect
  • GPT-4o incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
Item 10 · 27% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    Model that performs a 3D convolution, applies Group Normalization, computes the mean
    """

    def __init__(self, in_channels, out_channels, kernel_size, num_groups):
        """
        :param in_channels: Input channels of the convolution
        :param out_channels: Output channels (also the GroupNorm channel count)
        :param kernel_size: Convolution kernel size
        :param num_groups: Number of groups for GroupNorm
        """
        super(Model, self).__init__()
        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size)
        self.group_norm = nn.GroupNorm(num_groups, out_channels)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, D, H, W).
        Returns:
            torch.Tensor: Output tensor of shape (batch_size,); the mean is
            taken without keepdim, so the reduced axes are dropped.
        """
        x = self.conv(x)
        x = self.group_norm(x)
        x = x.mean(dim=[1, 2, 3, 4])  # Compute mean across all dimensions except batch
        return x

batch_size = 128
in_channels = 3
out_channels = 24
D, H, W = 24, 32, 32
kernel_size = 3
num_groups = 8


def get_inputs():
    """Return a one-element list holding a uniform-random
    (batch_size, in_channels, D, H, W) tensor."""
    return [torch.rand(batch_size, in_channels, D, H, W)]


def get_init_inputs():
    """Return the constructor arguments, in the order the Model constructor declares them."""
    return [in_channels, out_channels, kernel_size, num_groups]

Subject outcomes

  • OpenAI o1 correct
  • DeepSeek-V3 incorrect
  • DeepSeek-R1 incorrect
  • GPT-4o incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
Item 11 · 37% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs a gemm, swish, divide, clamp, tanh, and clamp operations.
    """

    def __init__(self, in_features, out_features, bias=True):
        """
        :param in_features: Input width of the linear layer
        :param out_features: Output width of the linear layer
        :param bias: Whether the linear layer has a bias term
        """
        super(Model, self).__init__()
        self.gemm = nn.Linear(in_features, out_features, bias=bias)

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_features).
        Returns:
            torch.Tensor: Output tensor of shape (batch_size, out_features).
        """
        x = self.gemm(x)
        x = x * torch.sigmoid(x)  # Swish activation
        x = x / 2.0
        x = torch.clamp(x, min=-1.0, max=1.0)  # Clamp between -1 and 1
        x = torch.tanh(x)  # Tanh activation
        x = torch.clamp(x, min=-1.0, max=1.0)  # Clamp between -1 and 1
        return x

batch_size = 1024
in_features = 8192
out_features = 8192


def get_inputs():
    """Return a one-element list holding a uniform-random (batch_size, in_features) tensor."""
    return [torch.rand(batch_size, in_features)]


def get_init_inputs():
    """Return the constructor arguments, in the order the Model constructor
    declares them (``bias`` is left at its default)."""
    return [in_features, out_features]

Subject outcomes

  • DeepSeek-R1 correct
  • Claude 3.5 Sonnet incorrect
  • DeepSeek-V3 incorrect
  • Llama 3.1 405B Instruct incorrect
  • Llama 3.1 70B Instruct incorrect
  • OpenAI o1 incorrect
Item 12 · 43% solve rate

import torch import torch.nn as nn

class Model(nn.Module):
    """
    Simple model that performs Instance Normalization.
    """

    def __init__(self, num_features: int):
        """
        Initializes the InstanceNorm layer.

        Args:
            num_features (int): Number of features in the input tensor.
        """
        super(Model, self).__init__()
        self.inorm = nn.InstanceNorm2d(num_features=num_features)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Applies Instance Normalization to the input tensor.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, num_features, height, width).

        Returns:
            torch.Tensor: Output tensor with Instance Normalization applied, same shape as input.
        """
        return self.inorm(x)

batch_size = 112  # heavier workload
features = 64
dim1 = 512
dim2 = 512


def get_inputs():
    """Return a one-element list holding a uniform-random
    (batch_size, features, dim1, dim2) tensor."""
    x = torch.rand(batch_size, features, dim1, dim2)
    return [x]


def get_init_inputs():
    """Return the constructor arguments: the feature (channel) count."""
    return [features]

Subject outcomes

  • Claude 3.5 Sonnet correct
  • GPT-4o correct
  • DeepSeek-V3 correct
  • Llama 3.1 405B Instruct correct
  • DeepSeek-R1 incorrect
  • Llama 3.1 70B Instruct incorrect

Subjects

  1. OpenAI o1 · 0.5816
  2. DeepSeek-R1 · 0.2845
  3. Claude 3.5 Sonnet · 0.2805
  4. DeepSeek-V3 · 0.2439
  5. GPT-4o · 0.2227
  6. Llama 3.1 405B Instruct · 0.168
  7. Llama 3.1 70B Instruct · 0.076