NVIDIA cuTile Python Tutorial: Building Tiled GPU Kernels for Vector Addition, Matrix Addition, and Matrix Multiplication in Colab

# Section banner for step 5 of the tutorial output: a blank line, a 90-column
# rule, the heading, and a closing rule.
print("\n" + "=" * 90)
print("[5] cuTile kernels are defined only if cuda.tile imports successfully")
print("=" * 90)
# Define the cuTile kernels only when `cutile_import_ok` is true. When it is
# false nothing below is defined and a notice is printed instead, so callers
# must not reference the kernel names in that case.
if cutile_import_ok:
    # Annotation used for the tile-size parameters of every kernel below.
    # NOTE(review): in cuTile this marks the argument as a compile-time
    # integer constant -- confirm against the cuda.tile documentation.
    ConstInt = ct.Constant[int]

    # 1-D vector add using tiled load/store.
    # Each block reads the TILE-wide tile of `a` and `b` at tile index `bid`,
    # adds them, and writes the result to the same tile index of `c`.
    @ct.kernel
    def cutile_vec_add_direct_kernel(a, b, c, TILE: ConstInt):
        bid = ct.bid(0)
        a_tile = ct.load(a, index=(bid,), shape=(TILE,))
        b_tile = ct.load(b, index=(bid,), shape=(TILE,))
        c_tile = a_tile + b_tile
        ct.store(c, index=(bid,), tile=c_tile)

    # 1-D vector add using explicit element offsets.
    # Offsets are `bid * TILE + [0, TILE)`; elements are fetched with gather
    # and written back with scatter at the same offsets.
    @ct.kernel
    def cutile_vec_add_gather_kernel(a, b, c, TILE: ConstInt):
        bid = ct.bid(0)
        offsets = bid * TILE + ct.arange(TILE, dtype=torch.int32)
        a_tile = ct.gather(a, offsets)
        b_tile = ct.gather(b, offsets)
        c_tile = a_tile + b_tile
        ct.scatter(c, offsets, c_tile)

    # 2-D matrix add using gather/scatter.
    # Grid axis 0 selects the row tile and axis 1 the column tile. Row and
    # column offsets are reshaped to (TILE_M, 1) and (1, TILE_N) so that the
    # pair addresses a TILE_M x TILE_N patch.
    @ct.kernel
    def cutile_matrix_add_gather_kernel(a, b, c, TILE_M: ConstInt, TILE_N: ConstInt):
        bid_m = ct.bid(0)
        bid_n = ct.bid(1)
        rows = bid_m * TILE_M + ct.arange(TILE_M, dtype=torch.int32)
        cols = bid_n * TILE_N + ct.arange(TILE_N, dtype=torch.int32)
        rows = rows[:, None]
        cols = cols[None, :]
        a_tile = ct.gather(a, (rows, cols))
        b_tile = ct.gather(b, (rows, cols))
        c_tile = a_tile + b_tile
        ct.scatter(c, (rows, cols), c_tile)

    # Tiled matrix multiply: C[bid_m, bid_n] = sum over k of A[bid_m, k] @ B[k, bid_n].
    # The accumulator is float32 regardless of the input dtype and is cast to
    # C's dtype only once, just before the store.
    @ct.kernel
    def cutile_matmul_kernel(A, B, C, TM: ConstInt, TN: ConstInt, TK: ConstInt):
        bid_m = ct.bid(0)
        bid_n = ct.bid(1)
        # Number of (TM, TK) tiles along A's axis 1, i.e. the K-loop trip count.
        num_tiles_k = ct.num_tiles(A, axis=1, shape=(TM, TK))
        acc = ct.full((TM, TN), 0, dtype=ct.float32)
        # Passed to both loads as their padding mode.
        # NOTE(review): presumably makes partial edge tiles read as zeros,
        # which would leave the accumulated sum unaffected -- confirm.
        zero_pad = ct.PaddingMode.ZERO
        # float32 inputs are cast to tfloat32 before the MMA; any other input
        # dtype is used unchanged.
        compute_dtype = ct.tfloat32 if A.dtype == ct.float32 else A.dtype
        for k in range(num_tiles_k):
            a_tile = ct.load(
                A,
                index=(bid_m, k),
                shape=(TM, TK),
                padding_mode=zero_pad
            ).astype(compute_dtype)
            b_tile = ct.load(
                B,
                index=(k, bid_n),
                shape=(TK, TN),
                padding_mode=zero_pad
            ).astype(compute_dtype)
            acc = ct.mma(a_tile, b_tile, acc)
        out = ct.astype(acc, C.dtype)
        ct.store(C, index=(bid_m, bid_n), tile=out)
else:
    print("Skipping cuTile kernel definitions because cuda.tile is unavailable.")
# Section banner for step 6 of the tutorial output: a blank line, a 90-column
# rule, the heading, and a closing rule.
print("\n" + "=" * 90)
print("[6] High-level wrappers")
print("=" * 90)
def vec_add_tutorial(a, b, use_gather=True):
    """Add two equally shaped tensors, using a cuTile kernel when possible.

    The cuTile path is taken only when ``likely_runtime_ok`` is true and
    ``a`` lives on a CUDA device; otherwise the result is plain ``a + b``.

    Args:
        a: First operand (a torch tensor).
        b: Second operand; must have the same shape as ``a``.
        use_gather: When true, launch the gather/scatter kernel with a fixed
            tile of 256 elements. When false, launch the direct load/store
            kernel with the smallest power-of-two tile covering the input,
            capped at 1024.

    Returns:
        A tensor holding the element-wise sum.

    Raises:
        ValueError: If ``a`` and ``b`` have different shapes.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")

    count = a.numel()
    if count == 0:
        # Nothing to tile: a power-of-two tile size is undefined for zero
        # elements and the launch grid would be empty, so let PyTorch
        # produce the (empty) result.
        return a + b

    if likely_runtime_ok and a.is_cuda:
        c = torch.empty_like(a)
        if use_gather:
            TILE = 256
        else:
            # Smallest power of two >= count, computed with integer math so
            # no float rounding is involved, then capped at 1024.
            TILE = min(1024, 1 << (count - 1).bit_length())
        # Ceiling division: one block per tile, including a partial last tile.
        grid = ((count + TILE - 1) // TILE, 1, 1)
        kernel = cutile_vec_add_gather_kernel if use_gather else cutile_vec_add_direct_kernel
        ct.launch(torch.cuda.current_stream(), grid, kernel, (a, b, c, TILE))
        return c
    return a + b
def matrix_add_tutorial(a, b):
    """Add two equally shaped matrices, using a cuTile kernel when possible.

    The cuTile path is taken only when ``likely_runtime_ok`` is true and
    ``a`` lives on a CUDA device; otherwise the result is plain ``a + b``.
    On the cuTile path the launch grid is derived from ``a.shape[0]`` and
    ``a.shape[1]`` with fixed 16 x 64 tiles.

    Args:
        a: First operand (a torch tensor).
        b: Second operand; must have the same shape as ``a``.

    Returns:
        A tensor holding the element-wise sum.

    Raises:
        ValueError: If ``a`` and ``b`` have different shapes.
    """
    if a.shape != b.shape:
        raise ValueError("a and b must have the same shape")

    if likely_runtime_ok and a.is_cuda:
        c = torch.empty_like(a)
        TILE_M = 16
        TILE_N = 64
        # Ceiling division per axis so partial edge tiles still get a block.
        grid = (
            (a.shape[0] + TILE_M - 1) // TILE_M,
            (a.shape[1] + TILE_N - 1) // TILE_N,
            1,
        )
        ct.launch(
            torch.cuda.current_stream(),
            grid,
            cutile_matrix_add_gather_kernel,
            (a, b, c, TILE_M, TILE_N),
        )
        return c
    return a + b
def matmul_tutorial(A, B):
    """Multiply two matrices, using the cuTile matmul kernel when possible.

    The cuTile path is taken only when ``likely_runtime_ok`` is true and
    ``A`` lives on a CUDA device; otherwise the result is ``A @ B``.

    Args:
        A: Left operand; ``A.shape[1]`` is the shared inner dimension.
        B: Right operand; ``B.shape[0]`` must equal ``A.shape[1]``.

    Returns:
        The product. On the cuTile path it is a new tensor of shape
        ``(A.shape[0], B.shape[1])`` with ``A``'s device and dtype.

    Raises:
        ValueError: If the inner dimensions do not match.
    """
    if A.shape[1] != B.shape[0]:
        raise ValueError("A.shape[1] must equal B.shape[0]")

    if likely_runtime_ok and A.is_cuda:
        # Half-precision inputs get larger tiles; everything else uses 32^3.
        if A.dtype in (torch.float16, torch.bfloat16):
            TM, TN, TK = 128, 128, 64
        else:
            TM, TN, TK = 32, 32, 32
        C = torch.empty((A.shape[0], B.shape[1]), device=A.device, dtype=A.dtype)
        # One block per (TM, TN) output tile; ceiling division covers edges.
        grid = (
            (A.shape[0] + TM - 1) // TM,
            (B.shape[1] + TN - 1) // TN,
            1,
        )
        ct.launch(
            torch.cuda.current_stream(),
            grid,
            cutile_matmul_kernel,
            (A, B, C, TM, TN, TK),
        )
        return C
    return A @ B
# Report that the wrappers are defined and which backend they will use:
# "cuTile" when `likely_runtime_ok` is true, "PyTorch fallback" otherwise.
print("Wrappers ready.")
print(f"Execution backend: {'cuTile' if likely_runtime_ok else 'PyTorch fallback'}")

Source link

NVIDIA cuTile Python Tutorial: Building Tiled GPU Kernels for Vector Addition, Matrix Addition, and Matrix Multiplication in Colab

Working to automate nuclear plant operations | MIT News

Microsoft launches new in-house AI models it says cut costs up to 89% versus OpenAI

Nvidia bets physical AI can solve healthcare robotics’ data problem

Unsloth vs Axolotl vs TRL vs LLaMA-Factory: A Fine-Tuning Framework Comparison on Speed, VRAM, and Multi-GPU

Ripple Launches Mint for Institutional RLUSD Access

Dango Blockchain to Shut Down, Halt Perp DEX Trading

U.S. Spot Bitcoin, Ethereum ETFs Post Net Outflows as Institutional Demand Cools

Why Wall Street Can’t Get Enough of This Dividend King

After Twenty One Exit, Jack Mallers Says Bitcoin Taught Him Hard Lessons

Top Insights

TRON Reaches $90B+ USDT on the Network, $887M+ in Crypto Card Volume in Q2, CoinDesk Data and CryptoQuant

Working to automate nuclear plant operations | MIT News

NVIDIA cuTile Python Tutorial: Building Tiled GPU Kernels for Vector Addition, Matrix Addition, and Matrix Multiplication in Colab

Related Posts