Andrew Ceniccola

```cpp #include

#define TILE_SIZE 32

/**

@brief General Tiled Matrix Multiplication Kernel (C = A * B)
@param A Pointer to Matrix A (Size: M x K)
@param B Pointer to Matrix B (Size: K x N)
@param C Pointer to Output Matrix C (Size: M x N)
@param M Number of rows in A and C
@param N Number of columns in B and C

@param K Inner dimension (Cols of A, Rows of B) / global void general_tiled_gemm_kernel(const float A, const float* B, float* C, int M, int N, int K) {

// 1. Global Coordinates for the Output Matrix C int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x;

// Allocate Shared Memory Tiles shared float s_A[TILE_SIZE][TILE_SIZE]; shared float s_B[TILE_SIZE][TILE_SIZE];

float c_value = 0.0f;

// 2. Calculate the sliding distance along the inner dimension (K) // We use ceil division to ensure we process the “leftover” chunk at the end of K int num_tiles = (K + TILE_SIZE - 1) / TILE_SIZE;

for (int t = 0; t < num_tiles; ++t) {

 // --- PHASE A: Load Matrix A with Boundary Checks ---
 // A thread is assigned to load a specific cell from A.
 // The row is fixed, but the column slides horizontally by (t * TILE_SIZE)
 int a_col = t * TILE_SIZE + threadIdx.x;
    
 // Check if the physical thread is within the actual bounds of Matrix A
 if (row < M && a_col < K) {
     // Note how we calculate a 1D index for a 2D matrix: (row * row_width + col)
     s_A[threadIdx.y][threadIdx.x] = A[row * K + a_col];
 } else {
     // ZERO-PADDING: If the thread is outside the matrix, load a 0.
     // This ensures the math later on isn't corrupted by garbage memory.
     s_A[threadIdx.y][threadIdx.x] = 0.0f;
 }


 // --- PHASE B: Load Matrix B with Boundary Checks ---
 // The column is fixed, but the row slides vertically down by (t * TILE_SIZE)
 int b_row = t * TILE_SIZE + threadIdx.y;
    
 if (b_row < K && col < N) {
     s_B[threadIdx.y][threadIdx.x] = B[b_row * N + col];
 } else {
     s_B[threadIdx.y][threadIdx.x] = 0.0f; // ZERO-PADDING
 }

 // Wait for all threads to finish loading and padding
 __syncthreads();


 // --- PHASE C: Compute ---
 // We always loop a full TILE_SIZE times.
 // Because we padded with 0.0f, out-of-bounds elements simply add 0 to c_value!
 for (int i = 0; i < TILE_SIZE; ++i) {
     c_value += s_A[threadIdx.y][i] * s_B[i][threadIdx.x];
 }

 // Wait for all math to finish before overwriting shared memory in the next loop
 __syncthreads();  }

// — PHASE D: Write to Output Matrix C — // Finally, ensure only threads that correspond to valid cells in Matrix C write the result. // The “ghost threads” that fell off the edge during the grid launch do nothing here. if (row < M && col < N) { C[row * N + col] = c_value; } }

—

void launch_general_gemm(float* d_A, float* d_B, float* d_C, int M, int N, int K) {

dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);

// We must use "ceiling division" to calculate the number of blocks.
// Integer division rounds down. If N=100 and TILE_SIZE=32, 100/32 = 3.
// The formula (X + TILE_SIZE - 1) / TILE_SIZE forces the division to round UP.

int blocks_x = (N + TILE_SIZE - 1) / TILE_SIZE; // Covers columns
int blocks_y = (M + TILE_SIZE - 1) / TILE_SIZE; // Covers rows

dim3 numBlocks(blocks_x, blocks_y);

general_tiled_gemm_kernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, M, N, K);
cudaDeviceSynchronize(); }