Documentation
Learn how to write GPU kernels in Python with locomp.
Installation
pip install locomp
Python 3.10+. No Xcode, no nvcc, no toolchain setup required.
Apple Silicon: macOS, M1/M2/M3/M4
NVIDIA CUDA: Linux/Windows, CUDA 11+
RISC-V RVV: Linux + gcc-riscv64-linux-gnu
Quick Start
vector_add.py
import locomp
import numpy as np
@locomp.kernel
def vector_add(X: locomp.Tensor, Y: locomp.Tensor, O: locomp.Tensor,
               N: locomp.constexpr):
    i = locomp.program_id(0)
    locomp.store(O + i, locomp.load(X + i) + locomp.load(Y + i))
# Apple Silicon (Metal)
x = locomp.tensor(np.ones(4, dtype=np.float32))
y = locomp.tensor(np.ones(4, dtype=np.float32) * 2)
o = locomp.empty(4)
vector_add[(4,)](x, y, o, N=4)
print(o.numpy()) # [3. 3. 3. 3.]
# NVIDIA GPU (CUDA)
x = locomp.tensor(np.ones(4, dtype=np.float32), backend="cuda")
y = locomp.tensor(np.ones(4, dtype=np.float32) * 2, backend="cuda")
o = locomp.empty(4, backend="cuda")
vector_add[(4,)](x, y, o, N=4)
print(o.numpy()) # [3. 3. 3. 3.]
N: locomp.constexpr is inlined as a compile-time literal, and the compiled pipeline is cached per constexpr configuration.
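As a sketch of that caching behavior (assuming each distinct constexpr value compiles and caches its own pipeline), repeating a call with the same N reuses the cached pipeline, while a new N triggers a fresh compile:

x8 = locomp.tensor(np.arange(8, dtype=np.float32))
y8 = locomp.tensor(np.arange(8, dtype=np.float32))
o8 = locomp.empty(8)
vector_add[(4,)](x, y, o, N=4)      # same constexpr config as above: cached pipeline reused
vector_add[(8,)](x8, y8, o8, N=8)   # new constexpr config: compiles a second pipeline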
Dispatch Model
# 1D grid — N threadgroups, 1 thread each
kernel[(N,)](args...)
# 1D grid with threadgroup size — N groups, T threads each
kernel[(N,), (T,)](args...)
# 2D grid
kernel[(gx, gy)](args...)
# 2D grid + 2D threadgroup
kernel[(gx, gy), (tx, ty)](args...)
Use locomp.program_id(axis) for the threadgroup index, locomp.local_id(axis) for the thread index within a group.
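A minimal sketch combining the two index functions (the kernel name scaled_copy and the group size T are illustrative, not part of the library):

@locomp.kernel
def scaled_copy(X: locomp.Tensor, O: locomp.Tensor,
                T: locomp.constexpr):
    # flat global index = threadgroup index * threads per group + index within group
    i = locomp.program_id(0) * T + locomp.local_id(0)
    locomp.store(O + i, locomp.load(X + i) * 2.0)

x = locomp.tensor(np.arange(1024, dtype=np.float32))
o = locomp.empty(1024)
scaled_copy[(1024 // 128,), (128,)](x, o, T=128)   # 8 groups of 128 threads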
SIMD Reductions
local_sum = locomp.simd_sum(val) # Sum across 32-lane SIMD group
local_max = locomp.simd_max(val) # Max across SIMD group
lane_id = locomp.simd_lane_id() # Lane index [0, 31]
group_id = locomp.simd_group_id() # SIMD group index within threadgroup
On Apple Silicon, a SIMD group is 32 threads (equivalent to a CUDA warp).
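A sketch of a per-group reduction built on these primitives (group_sums is an illustrative name; it assumes simd_sum returns the same value to every lane, so each lane can simply write the group sum to its own slot and no conditional is needed):

@locomp.kernel
def group_sums(X: locomp.Tensor, O: locomp.Tensor,
               T: locomp.constexpr):
    i = locomp.program_id(0) * T + locomp.local_id(0)
    val = locomp.load(X + i)
    s = locomp.simd_sum(val)    # sum across the 32-lane SIMD group
    locomp.store(O + i, s)      # every lane writes the identical group sum

x = locomp.tensor(np.ones(1024, dtype=np.float32))
o = locomp.empty(1024)
group_sums[(32,), (32,)](x, o, T=32)   # each 32-thread group produces one sum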
Auto-Tuning
from locomp import autotune, Config
@autotune(
    configs=[
        Config(BLOCK_M=16, BLOCK_N=16),
        Config(BLOCK_M=32, BLOCK_N=32),
        Config(BLOCK_M=64, BLOCK_N=64),
    ],
    key=["M", "N", "K"],
)
@locomp.kernel
def tuned_matmul(A: locomp.Tensor, B: locomp.Tensor, C: locomp.Tensor,
                 M: locomp.constexpr, N: locomp.constexpr, K: locomp.constexpr,
                 BLOCK_M: locomp.constexpr, BLOCK_N: locomp.constexpr):
    pass
Results are cached to ~/.cache/locomp/autotune.json. Call locomp.clear_cache() to reset.
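A hypothetical call site for the tuned kernel, assuming (as in Triton-style autotuners) that BLOCK_M and BLOCK_N are filled in from whichever Config wins the timing sweep and are therefore not passed explicitly:

tuned_matmul[(M * N,)](a, b, c, M=M, N=N, K=K)   # a, b, c, M, N, K defined by the caller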
Float16 & Quantization
@locomp.kernel
def fp16_add(X: locomp.Tensor, Y: locomp.Tensor, O: locomp.Tensor,
             N: locomp.constexpr):
    i = locomp.program_id(0)
    x = locomp.cast(locomp.load(X + i), locomp.Float16)
    y = locomp.cast(locomp.load(Y + i), locomp.Float16)
    locomp.store(O + i, locomp.cast(x + y, locomp.Float32))
Types: locomp.Float16, locomp.BFloat16, locomp.Int8, locomp.UInt8
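A sketch of Int8 dequantization built from the same primitives (dequant_int8 and SCALE are illustrative names; it assumes load returns the tensor's element type and that a float constexpr is accepted):

@locomp.kernel
def dequant_int8(Q: locomp.Tensor, O: locomp.Tensor,
                 SCALE: locomp.constexpr):
    i = locomp.program_id(0)
    q = locomp.load(Q + i)               # Int8 value from the quantized tensor
    f = locomp.cast(q, locomp.Float32)   # widen to float32
    locomp.store(O + i, f * SCALE)       # apply the per-tensor scale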
63 Kernel Examples
python examples/01_vector_add.py
python examples/15_flash_attention.py
python examples/25_autotune.py
python examples/38_quantized_matmul.py
python examples/54_smollm2_inference.py # SmolLM2-135M full inference
python examples/59_cuda_benchmark_suite.py # NVIDIA A100 benchmarks
python examples/63_riscv_codegen.py       # RISC-V codegen demo
Kernel Graph
# Without graph: one sync per kernel
rms_norm[(rows,), (128,)](x, w, h, N=dim, eps=1e-5)
matmul[(M * N,)](h, w2, out, M=M, N=K, K=dim)
# With graph: single command buffer, one sync at the end
with locomp.graph() as g:
    g.add(rms_norm, (rows,), (128,), x, w, h, N=dim, eps=1e-5)
    g.add(matmul, (M * N,), h, w2, out, M=M, N=K, K=dim)
# Or run manually and re-run
g = locomp.graph()
g.add(rms_norm, (rows,), (128,), x, w, h, N=dim, eps=1e-5)
g.run() # first run
g.run() # re-run same kernel sequence
Profiler
with locomp.profile() as p:
    rms_norm[(rows,), (128,)](x, w, out, N=dim, eps=1e-5)
    matmul[(M * N,)](a, b, c, M=M, N=N, K=K)
print(p.report())
# Kernel     Grid      Time
# rms_norm   (128,)    0.245 ms
# matmul     (4096,)   1.103 ms