// -----------------------------------------------------------------------------
// fir_16tap
//
// Direct-form FIR filter with fixed, symmetric integer coefficients.
// NOTE: despite the module name, the filter has 17 taps (h0..h16, order 16).
//
//   y[n] = sum_{k=0}^{16} h[k] * x[n-k]
//
// Ports
//   clk        : clock; all state updates on the rising edge.
//   rst        : synchronous, active-high reset. Clears the sample history,
//                sample_out and out_valid.
//   sample_in  : signed 16-bit input sample x[n]; consumed only when in_valid
//                is high.
//   in_valid   : qualifies sample_in for the current clock edge.
//   sample_out : signed 32-bit full-precision result (no scaling or rounding).
//                Updated only on edges where in_valid is high; otherwise it
//                holds its previous value.
//   out_valid  : high for the cycle following every edge that accepted a
//                sample, i.e. in_valid delayed by one clock.
//
// Latency: one clock. The newest sample is taken straight from sample_in and
// combined with the stored history, so the result for x[n] is registered on
// the same edge that accepts x[n].
//
// Headroom: the coefficients sum to 21426, so |sample_out| is at most
// 21426 * 32768 = 702,087,168, which fits in a signed 32-bit register.
// -----------------------------------------------------------------------------
module fir_16tap (
    input wire clk,
    input wire rst,
    input wire signed [15:0] sample_in,
    input wire in_valid,
    output reg signed [31:0] sample_out,
    output reg out_valid
);

    // Filter coefficients. The set is symmetric: h[k] == h[16-k].
    localparam signed [15:0] h0  = 16'sd65;
    localparam signed [15:0] h1  = 16'sd184;
    localparam signed [15:0] h2  = 16'sd419;
    localparam signed [15:0] h3  = 16'sd839;
    localparam signed [15:0] h4  = 16'sd1366;
    localparam signed [15:0] h5  = 16'sd1888;
    localparam signed [15:0] h6  = 16'sd2276;
    localparam signed [15:0] h7  = 16'sd2432;
    localparam signed [15:0] h8  = 16'sd2488;
    localparam signed [15:0] h9  = 16'sd2432;
    localparam signed [15:0] h10 = 16'sd2276;
    localparam signed [15:0] h11 = 16'sd1888;
    localparam signed [15:0] h12 = 16'sd1366;
    localparam signed [15:0] h13 = 16'sd839;
    localparam signed [15:0] h14 = 16'sd419;
    localparam signed [15:0] h15 = 16'sd184;
    localparam signed [15:0] h16 = 16'sd65;

    // Sample history. Before the clock edge that accepts x[n]:
    //   x[0] = x[n-1] (most recent stored sample) ... x[15] = x[n-16] (oldest).
    // Only 16 entries are needed because the 17th operand, x[n], comes
    // directly from sample_in.
    reg signed [15:0] x [0:15];
    integer i;

    always @(posedge clk) begin
        if (rst) begin
            for (i = 0; i < 16; i = i + 1)
                x[i] <= 0;
            sample_out <= 0;
            out_valid  <= 1'b0;
        end else if (in_valid) begin
            // Age the history by one position and store the new sample.
            // Non-blocking assignments: every right-hand side below, including
            // the sum of products, reads the history as it was before this edge.
            for (i = 15; i > 0; i = i - 1)
                x[i] <= x[i-1];
            x[0] <= sample_in;

            // All operands are signed and the assignment target is 32 bits
            // wide, so each product and the running sum are evaluated as
            // signed 32-bit values.
            sample_out <=
                h0  * sample_in +
                h1  * x[0]  +
                h2  * x[1]  +
                h3  * x[2]  +
                h4  * x[3]  +
                h5  * x[4]  +
                h6  * x[5]  +
                h7  * x[6]  +
                h8  * x[7]  +
                h9  * x[8]  +
                h10 * x[9]  +
                h11 * x[10] +
                h12 * x[11] +
                h13 * x[12] +
                h14 * x[13] +
                h15 * x[14] +
                h16 * x[15];

            out_valid <= 1'b1;
        end else begin
            // No new sample: keep history and sample_out, drop the strobe.
            out_valid <= 1'b0;
        end
    end

endmodule
