A Coding Implementation to Master GPU Computing with CuPy, Custom CUDA Kernels, Streams, Sparse Matrices, and Profiling

header("6. RAW CUDA KERNEL — MANDELBROT")
mandel = cp.RawKernel(r'''
extern "C" __global__
void mandel(float xmin, float xmax, float ymin, float ymax,
           int W, int H, int max_iter, int* out) {
   int ix = blockDim.x * blockIdx.x + threadIdx.x;
   int iy = blockDim.y * blockIdx.y + threadIdx.y;
   if (ix >= W || iy >= H) return;
   float cx = xmin + (xmax - xmin) * ix / (W - 1);
   float cy = ymin + (ymax - ymin) * iy / (H - 1);
   float zx = 0.f, zy = 0.f;
   int it = 0;
   while (zx*zx + zy*zy < 4.f && it < max_iter) {
       float t = zx*zx - zy*zy + cx;
       zy = 2.f*zx*zy + cy;
       zx = t; ++it;
   }
   out[iy*W + ix] = it;
}
''', 'mandel')
W, H, ITER = 1024, 1024, 400
img = cp.zeros((H, W), dtype=cp.int32)
threads = (16, 16)
blocks = ((W + 15)//16, (H + 15)//16)
mandel(blocks, threads,
      (cp.float32(-2.0), cp.float32(1.0),
       cp.float32(-1.5), cp.float32(1.5),
       W, H, ITER, img))
cp.cuda.Stream.null.synchronize()
print(f"Mandelbrot done. max iter reached={int(img.max())}")
plt.figure(figsize=(6,6))
plt.imshow(cp.asnumpy(cp.log1p(img)), cmap='twilight_shifted', extent=[-2,1,-1.5,1.5])
plt.title("Mandelbrot set — computed with a CuPy RawKernel")
plt.axis('off'); plt.show()
header("7. CUDA STREAMS")
s1, s2 = cp.cuda.Stream(non_blocking=True), cp.cuda.Stream(non_blocking=True)
with s1:
   a1 = cp.random.rand(2000, 2000, dtype=cp.float32)
   b1 = cp.random.rand(2000, 2000, dtype=cp.float32)
   c1 = a1 @ b1
with s2:
   a2 = cp.random.rand(2000, 2000, dtype=cp.float32)
   b2 = cp.random.rand(2000, 2000, dtype=cp.float32)
   c2 = a2 @ b2
s1.synchronize(); s2.synchronize()
print(f"Stream-1 mean={float(c1.mean()):.4f}")
print(f"Stream-2 mean={float(c2.mean()):.4f}")