Documentation

Learn how to write GPU kernels in Python with locomp.

Installation

pip install locomp

Python 3.10+. No Xcode, no nvcc, no toolchain setup required.

Backend         Platform / Requirements
Apple Silicon   macOS, M1/M2/M3/M4
NVIDIA CUDA     Linux/Windows, CUDA 11+
RISC-V RVV      Linux + gcc-riscv64-linux-gnu

Quick Start

vector_add.py
import locomp
import numpy as np

@locomp.kernel
def vector_add(X: locomp.Tensor, Y: locomp.Tensor, O: locomp.Tensor,
               N: locomp.constexpr):
    i = locomp.program_id(0)
    locomp.store(O + i, locomp.load(X + i) + locomp.load(Y + i))

# Apple Silicon (Metal)
x = locomp.tensor(np.ones(4, dtype=np.float32))
y = locomp.tensor(np.ones(4, dtype=np.float32) * 2)
o = locomp.empty(4)
vector_add[(4,)](x, y, o, N=4)
print(o.numpy())  # [3. 3. 3. 3.]

# NVIDIA GPU (CUDA)
x = locomp.tensor(np.ones(4, dtype=np.float32), backend="cuda")
y = locomp.tensor(np.ones(4, dtype=np.float32) * 2, backend="cuda")
o = locomp.empty(4, backend="cuda")
vector_add[(4,)](x, y, o, N=4)
print(o.numpy())  # [3. 3. 3. 3.]

N: locomp.constexpr is inlined into the kernel as a compile-time literal. The compiled pipeline is cached per constexpr configuration, so each distinct value of N compiles once and is then reused.
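
For example (illustrative; the arrays below are ours), a new value of N compiles and caches a new specialization, while repeat calls with the same N reuse it:

x8 = locomp.tensor(np.ones(8, dtype=np.float32))
y8 = locomp.tensor(np.ones(8, dtype=np.float32) * 2)
o8 = locomp.empty(8)
vector_add[(8,)](x8, y8, o8, N=8)  # new constexpr value: compile once, cache
vector_add[(8,)](x8, y8, o8, N=8)  # same config: cached pipeline, no recompile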

Dispatch Model

# 1D grid — N threadgroups, 1 thread each
kernel[(N,)](args...)

# 1D grid with threadgroup size — N groups, T threads each
kernel[(N,), (T,)](args...)

# 2D grid
kernel[(gx, gy)](args...)

# 2D grid + 2D threadgroup
kernel[(gx, gy), (tx, ty)](args...)

Use locomp.program_id(axis) for threadgroup index, locomp.local_id(axis) for thread index within group.
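
A common pattern combines the two into a global element index. A minimal sketch (the kernel name and the assumption that the element count is a multiple of the group size are ours):

@locomp.kernel
def copy2x(X: locomp.Tensor, O: locomp.Tensor, T: locomp.constexpr):
    i = locomp.program_id(0) * T + locomp.local_id(0)  # global element index
    locomp.store(O + i, locomp.load(X + i) * 2.0)

copy2x[(n // 128,), (128,)](x, o, T=128)  # n elements, 128 threads per group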

Shared Memory & Sync

rms_norm.py
@locomp.kernel
def rms_norm(X: locomp.Tensor, W: locomp.Tensor, O: locomp.Tensor,
             N: locomp.constexpr, EPS: locomp.constexpr):
    row = locomp.program_id(0)   # one threadgroup per row
    tid = locomp.local_id(0)     # 128 threads per group (see launch below)

    smem = locomp.shared_memory(32)
    # Each thread accumulates a strided partial sum of squares.
    local_sum = 0.0
    for i in range(tid, N, 128):
        val = locomp.load(X + (row * N + i))
        local_sum = local_sum + val * val

    # Reduce within each 32-lane SIMD group, stage one partial per group.
    local_sum = locomp.simd_sum(local_sum)
    if locomp.simd_lane_id() == 0:
        locomp.shared_store(smem, locomp.simd_group_id(), local_sum)
    locomp.barrier()

    # Thread 0 combines the four group partials (128 threads / 32 lanes).
    if tid == 0:
        total = 0.0
        for g in range(4):
            total = total + locomp.shared_load(smem, g)
        rms = locomp.rsqrt(total / N + EPS)
        locomp.shared_store(smem, 0, rms)
    locomp.barrier()

    # Every thread reads the broadcast scale and writes its strided slice.
    rms = locomp.shared_load(smem, 0)
    for i in range(tid, N, 128):
        val = locomp.load(X + (row * N + i))
        w = locomp.load(W + i)
        locomp.store(O + (row * N + i), val * rms * w)

rms_norm[(rows,), (128,)](x, weights, out, N=dim, EPS=1e-5)
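
To sanity-check the kernel against NumPy, a self-contained host-side sketch (shapes, rtol, and the assumption that locomp.tensor accepts a 2D array are ours):

rows, dim = 64, 512
x_np = np.random.randn(rows, dim).astype(np.float32)
w_np = np.ones(dim, dtype=np.float32)
x, weights, out = locomp.tensor(x_np), locomp.tensor(w_np), locomp.empty(rows * dim)

rms_norm[(rows,), (128,)](x, weights, out, N=dim, EPS=1e-5)

ref = x_np / np.sqrt((x_np ** 2).mean(axis=-1, keepdims=True) + 1e-5) * w_np
np.testing.assert_allclose(out.numpy().reshape(rows, dim), ref, rtol=1e-3)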

SIMD Reductions

local_sum = locomp.simd_sum(val)     # Sum across 32-lane SIMD group
local_max = locomp.simd_max(val)     # Max across SIMD group
lane_id   = locomp.simd_lane_id()    # Lane index [0, 31]
group_id  = locomp.simd_group_id()   # SIMD group index within threadgroup

On Apple Silicon, a SIMD group is 32 threads (equivalent to a CUDA warp).
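
These primitives combine into the standard two-level reduction used by rms_norm above. A minimal sketch for a single 128-thread group (the kernel name and the shared_memory element count are ours):

@locomp.kernel
def group_sum(X: locomp.Tensor, O: locomp.Tensor):
    tid = locomp.local_id(0)
    smem = locomp.shared_memory(4)                 # one slot per SIMD group
    v = locomp.simd_sum(locomp.load(X + tid))      # level 1: within 32 lanes
    if locomp.simd_lane_id() == 0:
        locomp.shared_store(smem, locomp.simd_group_id(), v)
    locomp.barrier()
    if tid == 0:
        total = 0.0
        for g in range(4):                         # level 2: across 4 groups
            total = total + locomp.shared_load(smem, g)
        locomp.store(O + 0, total)

group_sum[(1,), (128,)](x, o)  # sums x[0:128] into o[0]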

Auto-Tuning

from locomp import autotune, Config

@autotune(
    configs=[
        Config(BLOCK_M=16, BLOCK_N=16),
        Config(BLOCK_M=32, BLOCK_N=32),
        Config(BLOCK_M=64, BLOCK_N=64),
    ],
    key=["M", "N", "K"],
)
@locomp.kernel
def tuned_matmul(A: locomp.Tensor, B: locomp.Tensor, C: locomp.Tensor,
                 M: locomp.constexpr, N: locomp.constexpr, K: locomp.constexpr,
                 BLOCK_M: locomp.constexpr, BLOCK_N: locomp.constexpr):
    pass  # tiled matmul body omitted; BLOCK_M/BLOCK_N select the tile shape

Tuning results are cached to ~/.cache/locomp/autotune.json; call locomp.clear_cache() to reset them.
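
On first use with a given key the tuner benchmarks every Config and records the winner; later calls with the same (M, N, K) skip straight to it. A usage sketch (the grid choice, and BLOCK_M/BLOCK_N being supplied by the tuner rather than the caller, are our assumptions):

tuned_matmul[(M * N,)](a, b, c, M=M, N=N, K=K)  # first call: benchmark + cache
tuned_matmul[(M * N,)](a, b, c, M=M, N=N, K=K)  # same key: cached best config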

Float16 & Quantization

@locomp.kernel
def fp16_add(X: locomp.Tensor, Y: locomp.Tensor, O: locomp.Tensor,
             N: locomp.constexpr):
    i = locomp.program_id(0)
    x = locomp.cast(locomp.load(X + i), locomp.Float16)   # narrow to fp16
    y = locomp.cast(locomp.load(Y + i), locomp.Float16)
    locomp.store(O + i, locomp.cast(x + y, locomp.Float32))  # widen to store

Types: locomp.Float16, locomp.BFloat16, locomp.Int8, locomp.UInt8
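
The same cast mechanism covers the integer types. A minimal dequantization sketch, assuming load yields the tensor's element type and that a scale can be passed as a constexpr like EPS above (the kernel name and SCALE parameter are ours):

@locomp.kernel
def dequant(Q: locomp.Tensor, O: locomp.Tensor, SCALE: locomp.constexpr):
    i = locomp.program_id(0)
    q = locomp.cast(locomp.load(Q + i), locomp.Float32)  # Int8 -> Float32
    locomp.store(O + i, q * SCALE)

dequant[(n,)](q, o, SCALE=0.05)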

63 Kernel Examples

python examples/01_vector_add.py
python examples/15_flash_attention.py
python examples/25_autotune.py
python examples/38_quantized_matmul.py
python examples/54_smollm2_inference.py     # SmolLM2-135M full inference
python examples/59_cuda_benchmark_suite.py  # NVIDIA A100 benchmarks
python examples/63_riscv_codegen.py         # RISC-V codegen demo

Kernel Graph

# Without graph: one sync per kernel
rms_norm[(rows,), (128,)](x, w, h, N=dim, EPS=1e-5)
matmul[(M * N,)](h, w2, out, M=M, N=N, K=dim)

# With graph: single command buffer, one sync at the end
with locomp.graph() as g:
    g.add(rms_norm, (rows,), (128,), x, w, h, N=dim, EPS=1e-5)
    g.add(matmul,   (M * N,),        h, w2, out, M=M, N=N, K=dim)

# Or build the graph manually and re-run it
g = locomp.graph()
g.add(rms_norm, (rows,), (128,), x, w, h, N=dim, EPS=1e-5)
g.run()  # first run
g.run()  # re-run the same kernel sequence
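
Because run() replays the recorded command sequence as-is, the same graph can drive an iterative workload (the loop below is illustrative):

for step in range(32):
    g.run()  # one replay per iteration, no re-recording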

Profiler

with locomp.profile() as p:
    rms_norm[(rows,), (128,)](x, w, out, N=dim, EPS=1e-5)
    matmul[(M * N,)](a, b, c, M=M, N=N, K=K)

print(p.report())
# Kernel          Grid     Time
# rms_norm        (128,)   0.245 ms
# matmul          (4096,)  1.103 ms