NVIDIA cuTile Python Tutorial: Building Tiled GPU Kernels for Vector Addition, Matrix Addition, and Matrix Multiplication in Colab

# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 90,
    "[5] cuTile kernels are defined only if cuda.tile imports successfully",
    "=" * 90,
    sep="\n",
)
if cutile_import_ok:
   ConstInt = ct.Constant[int]
   # Element-wise vector add using whole-tile load/store.
   # Each block handles the tile selected by its block id: it loads a
   # (TILE,)-shaped tile from `a` and from `b`, adds them, and stores the sum
   # into `c` at the same tile index. TILE is annotated as ConstInt
   # (ct.Constant[int]).
   # NOTE(review): the loads pass no padding_mode (unlike the matmul kernel in
   # this file), so an array length that is not a multiple of TILE presumably
   # relies on cuTile's default out-of-bounds handling -- confirm.
   @ct.kernel
   def cutile_vec_add_direct_kernel(a, b, c, TILE: ConstInt):
       # Block id along grid axis 0; used directly as the tile index below.
       bid = ct.bid(0)
       # `index` is passed as the bare block id (not bid * TILE), i.e. it
       # presumably addresses tiles rather than individual elements.
       a_tile = ct.load(a, index=(bid,), shape=(TILE,))
       b_tile = ct.load(b, index=(bid,), shape=(TILE,))
       c_tile = a_tile + b_tile
       # Write the summed tile back to the matching tile position of `c`.
       ct.store(c, index=(bid,), tile=c_tile)
   # Element-wise vector add using explicit per-element offsets.
   # Each block builds TILE offsets starting at bid * TILE, reads `a` and `b`
   # at those offsets with ct.gather, adds them, and writes the result to `c`
   # at the same offsets with ct.scatter.
   # NOTE(review): when the array length is not a multiple of TILE, the last
   # block's offsets run past the end; presumably ct.gather/ct.scatter handle
   # out-of-range indices safely -- confirm against the cuTile docs.
   @ct.kernel
   def cutile_vec_add_gather_kernel(a, b, c, TILE: ConstInt):
       # Block id along grid axis 0.
       bid = ct.bid(0)
       # Element offsets covered by this block: bid * TILE plus an int32 range
       # of length TILE.
       offsets = bid * TILE + ct.arange(TILE, dtype=torch.int32)
       a_tile = ct.gather(a, offsets)
       b_tile = ct.gather(b, offsets)
       c_tile = a_tile + b_tile
       ct.scatter(c, offsets, c_tile)
   # Element-wise matrix add using explicit row/column indices.
   # Each block is identified by (bid_m, bid_n) and covers a TILE_M x TILE_N
   # patch: it gathers that patch from `a` and `b`, adds the two, and scatters
   # the sum into `c` at the same indices.
   # NOTE(review): for shapes that are not multiples of the tile sizes, edge
   # blocks produce indices past the array bounds; presumably
   # ct.gather/ct.scatter handle those safely -- confirm against the cuTile docs.
   @ct.kernel
   def cutile_matrix_add_gather_kernel(a, b, c, TILE_M: ConstInt, TILE_N: ConstInt):
       # Block ids along grid axes 0 and 1.
       bid_m = ct.bid(0)
       bid_n = ct.bid(1)
       # Row and column indices covered by this block, as int32 ranges offset
       # by the block's starting row/column.
       rows = bid_m * TILE_M + ct.arange(TILE_M, dtype=torch.int32)
       cols = bid_n * TILE_N + ct.arange(TILE_N, dtype=torch.int32)
       # Add a trailing axis to rows and a leading axis to cols so the pair
       # presumably broadcasts to a (TILE_M, TILE_N) grid of (row, col) indices.
       rows = rows[:, None]
       cols = cols[None, :]
       a_tile = ct.gather(a, (rows, cols))
       b_tile = ct.gather(b, (rows, cols))
       c_tile = a_tile + b_tile
       ct.scatter(c, (rows, cols), c_tile)
   # Tiled matrix multiply: each block computes one (TM, TN) output tile of C
   # from A and B by iterating over tiles along A's axis 1 (the K dimension).
   # Products are accumulated in a float32 tile and cast to C.dtype before the
   # final store. Input tiles are loaded with zero padding.
   @ct.kernel
   def cutile_matmul_kernel(A, B, C, TM: ConstInt, TN: ConstInt, TK: ConstInt):
       # Block ids along grid axes 0 and 1 select the output tile (row, column).
       bid_m = ct.bid(0)
       bid_n = ct.bid(1)
       # Number of (TM, TK) tiles along axis 1 of A, i.e. the loop trip count.
       num_tiles_k = ct.num_tiles(A, axis=1, shape=(TM, TK))
       # Accumulator is float32 and starts at zero, independent of input dtype.
       acc = ct.full((TM, TN), 0, dtype=ct.float32)
       # Loads below use zero padding; presumably this fills out-of-range
       # elements of edge tiles with zeros so they do not affect the sum.
       zero_pad = ct.PaddingMode.ZERO
       # float32 inputs are cast to tfloat32 for the multiply; other dtypes are
       # used as-is.
       compute_dtype = ct.tfloat32 if A.dtype == ct.float32 else A.dtype
       for k in range(num_tiles_k):
           # Tile (bid_m, k) of A with shape (TM, TK).
           a_tile = ct.load(
               A,
               index=(bid_m, k),
               shape=(TM, TK),
               padding_mode=zero_pad
           ).astype(compute_dtype)
           # Tile (k, bid_n) of B with shape (TK, TN).
           b_tile = ct.load(
               B,
               index=(k, bid_n),
               shape=(TK, TN),
               padding_mode=zero_pad
           ).astype(compute_dtype)
           # Multiply the two tiles and fold the result into the accumulator.
           acc = ct.mma(a_tile, b_tile, acc)
       # Cast the float32 accumulator to the output dtype and store the tile.
       out = ct.astype(acc, C.dtype)
       ct.store(C, index=(bid_m, bid_n), tile=out)
else:
   print("Skipping cuTile kernel definitions because cuda.tile is unavailable.")
# Section banner: blank line, rule, title, rule.
print(
    "\n" + "=" * 90,
    "[6] High-level wrappers",
    "=" * 90,
    sep="\n",
)
def vec_add_tutorial(a, b, use_gather=True):
    """Add two equally shaped tensors, using a cuTile kernel when possible.

    The cuTile path is taken only when ``likely_runtime_ok`` is truthy and
    ``a.is_cuda`` is true; otherwise the result is computed as ``a + b``.

    Args:
        a: First operand.
        b: Second operand; must have the same shape as ``a``.
        use_gather: If true, launch the gather/scatter kernel with a fixed
            tile of 256 elements. Otherwise launch the direct load/store
            kernel with a tile equal to the next power of two of the element
            count, capped at 1024.

    Returns:
        A tensor holding the element-wise sum of ``a`` and ``b``.

    Raises:
        ValueError: If ``a`` and ``b`` differ in shape.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")

    if likely_runtime_ok and a.is_cuda:
        # NOTE(review): both kernels index along a single axis, so this path
        # presumably expects 1-D inputs -- confirm against callers.
        c = torch.empty_like(a)
        n = a.numel()
        if n == 0:
            # Nothing to add; also avoids math.log2(0) and an empty grid.
            return c
        TILE = 256 if use_gather else min(1024, 2 ** math.ceil(math.log2(n)))
        # One block per tile, rounding up so a partial tail tile is covered.
        grid = (math.ceil(n / TILE), 1, 1)
        kernel = cutile_vec_add_gather_kernel if use_gather else cutile_vec_add_direct_kernel
        ct.launch(torch.cuda.current_stream(), grid, kernel, (a, b, c, TILE))
        return c

    return a + b
def matrix_add_tutorial(a, b):
    """Add two equally shaped matrices, using a cuTile kernel when possible.

    The cuTile path is taken only when ``likely_runtime_ok`` is truthy and
    ``a.is_cuda`` is true; otherwise the result is computed as ``a + b``.

    Args:
        a: First operand. The cuTile path reads ``a.shape[0]`` and
            ``a.shape[1]``, so it needs at least two dimensions there.
        b: Second operand; must have the same shape as ``a``.

    Returns:
        A tensor holding the element-wise sum of ``a`` and ``b``.

    Raises:
        ValueError: If ``a`` and ``b`` differ in shape.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")

    if likely_runtime_ok and a.is_cuda:
        c = torch.empty_like(a)
        if a.numel() == 0:
            # Nothing to add; avoids launching with an empty grid.
            return c
        TILE_M = 16
        TILE_N = 64
        # One block per (TILE_M, TILE_N) patch, rounding up on both axes so
        # partial edge patches are covered.
        grid = (math.ceil(a.shape[0] / TILE_M), math.ceil(a.shape[1] / TILE_N), 1)
        ct.launch(
            torch.cuda.current_stream(),
            grid,
            cutile_matrix_add_gather_kernel,
            (a, b, c, TILE_M, TILE_N),
        )
        return c

    return a + b
def matmul_tutorial(A, B):
    """Multiply ``A`` by ``B``, using the cuTile matmul kernel when possible.

    The cuTile path is taken only when ``likely_runtime_ok`` is truthy and
    ``A.is_cuda`` is true; otherwise the result is computed as ``A @ B``.
    On the cuTile path, float16/bfloat16 inputs use 128x128x64 tiles and all
    other dtypes use 32x32x32 tiles; the output has ``A``'s dtype and device.

    Raises:
        ValueError: If ``A.shape[1]`` differs from ``B.shape[0]``.
    """
    if A.shape[1] != B.shape[0]:
        raise ValueError("A.shape[1] must equal B.shape[0]")

    # Fallback first: plain PyTorch when cuTile cannot be used.
    if not (likely_runtime_ok and A.is_cuda):
        return A @ B

    half_precision = A.dtype in (torch.float16, torch.bfloat16)
    TM, TN, TK = (128, 128, 64) if half_precision else (32, 32, 32)

    out_rows = A.shape[0]
    out_cols = B.shape[1]
    C = torch.empty((out_rows, out_cols), device=A.device, dtype=A.dtype)

    # One block per (TM, TN) output tile, rounding up on both axes.
    blocks_m = math.ceil(out_rows / TM)
    blocks_n = math.ceil(out_cols / TN)
    launch_args = (A, B, C, TM, TN, TK)
    ct.launch(
        torch.cuda.current_stream(),
        (blocks_m, blocks_n, 1),
        cutile_matmul_kernel,
        launch_args,
    )
    return C
# Report that the wrappers are defined and which backend they will use.
print(
    "Wrappers ready.",
    f"Execution backend: {'cuTile' if likely_runtime_ok else 'PyTorch fallback'}",
    sep="\n",
)