// Streaming 5-point stencil over a 64-sample tile addressed as 8 rows x 8 columns
// (row = index[5:3], column = index[2:0]).
//
// Each accepted sample (in_valid high) is stored at the current write position.
// Asserting tile_start together with in_valid forces that position to 0; otherwise
// the position continues from the previous sample and wraps from 63 back to 0.
//
// Whenever the incoming sample lands at row >= 2 and column >= 2, the stencil
//     north + south + west + east - 4 * centre
// is evaluated for the centre at (row - 1, column - 1), using only samples that
// were stored on earlier clock edges. The result passes through one holding
// register (pending_*) and then the output register, so it appears on
// stencil_out / out_valid two clock edges after the triggering sample is captured.
// stencil_out is driven to zero on cycles where out_valid is low.
//
// rst is a synchronous, active-high reset; it does not clear the tile memory.
module streaming_5point_stencil (
    input wire clk,
    input wire rst,
    input wire in_valid,
    input wire tile_start,
    input wire signed [7:0] sample_in,
    output reg out_valid,
    output reg signed [10:0] stencil_out
);
    reg signed [7:0] tile_mem [0:63];
    reg [5:0] sample_idx;
    reg pending_valid;
    reg signed [10:0] pending_data;

    // Position that the incoming sample occupies in the tile.
    wire [5:0] wr_pos = tile_start ? 6'd0 : sample_idx;
    wire [2:0] wr_row = wr_pos[5:3];
    wire [2:0] wr_col = wr_pos[2:0];

    // A full cross of previously stored samples exists only from (2, 2) onward.
    wire window_ready = (wr_row >= 3'd2) && (wr_col >= 3'd2);

    // Addresses of the cross centred one row up and one column left of wr_pos.
    // The 3-bit subtractions wrap when window_ready is low; the values read in
    // that case are never used.
    wire [5:0] north_addr  = {wr_row - 3'd2, wr_col - 3'd1};
    wire [5:0] south_addr  = {wr_row,        wr_col - 3'd1};
    wire [5:0] west_addr   = {wr_row - 3'd1, wr_col - 3'd2};
    wire [5:0] east_addr   = {wr_row - 3'd1, wr_col};
    wire [5:0] centre_addr = {wr_row - 3'd1, wr_col - 3'd1};

    wire signed [7:0] north_px  = tile_mem[north_addr];
    wire signed [7:0] south_px  = tile_mem[south_addr];
    wire signed [7:0] west_px   = tile_mem[west_addr];
    wire signed [7:0] east_px   = tile_mem[east_addr];
    wire signed [7:0] centre_px = tile_mem[centre_addr];

    // 4 * centre as an 11-bit signed value: one sign bit, the sample, two zero LSBs.
    wire signed [10:0] centre_x4 = {centre_px[7], centre_px, 2'b00};

    // All operands are signed, so each is sign-extended to 11 bits before summing.
    // The extreme results are +/-1020, which fit in 11 signed bits.
    wire signed [10:0] laplacian =
        north_px + south_px + west_px + east_px - centre_x4;

    always @(posedge clk) begin
        if (rst) begin
            sample_idx    <= 6'd0;
            pending_valid <= 1'b0;
            pending_data  <= 11'sd0;
            out_valid     <= 1'b0;
            stencil_out   <= 11'sd0;
        end else begin
            // Output stage: forward the held result, or zero when nothing is held.
            out_valid   <= pending_valid;
            stencil_out <= 11'sd0;
            if (pending_valid) begin
                stencil_out <= pending_data;
            end

            // Holding stage: capture a new result only when a sample completes a window.
            if (in_valid && window_ready) begin
                pending_valid <= 1'b1;
                pending_data  <= laplacian;
            end else begin
                pending_valid <= 1'b0;
                pending_data  <= 11'sd0;
            end

            // Store the sample and advance; the 6-bit increment wraps 63 -> 0.
            if (in_valid) begin
                tile_mem[wr_pos] <= sample_in;
                sample_idx       <= wr_pos + 6'd1;
            end
        end
    end
endmodule
