`timescale 1ns / 1ps

// fir_8tap: 8-tap FIR filter with a mirrored (symmetric) impulse response
//   h = { H0, H1, H2, H3, H3, H2, H1, H0 }
//
// One sample is accepted on every rising clock edge where in_valid is high,
// and the matching result is registered on that same edge (one output per
// accepted sample, one register stage from input to output).
//
// Parameters
//   H0..H3     signed 8-bit coefficients; tap k and tap 7-k share one value.
//
// Ports
//   clk        clock; all state updates on the rising edge.
//   rst        synchronous, active-high; clears history and both outputs.
//   sample_in  signed 8-bit input sample x[n].
//   in_valid   qualifies sample_in; when low, history and sample_out hold.
//   sample_out signed 16-bit result y[n], updated only when in_valid is high.
//   out_valid  high for the cycle after an accepted sample, low otherwise.
//
// Range note: the default coefficients sum to 256, so with 8-bit inputs the
// result spans -32768..32512 and fits the 16-bit output. Other coefficient
// values can exceed that range, in which case the result wraps to 16 bits.
module fir_8tap #(
    parameter signed [7:0] H0 = 8'sd8,
    parameter signed [7:0] H1 = 8'sd21,
    parameter signed [7:0] H2 = 8'sd45,
    parameter signed [7:0] H3 = 8'sd54
)(
    input wire clk,
    input wire rst,
    input wire signed [7:0] sample_in,
    input wire in_valid,
    output reg signed [15:0] sample_out,
    output reg out_valid
);
    // Number of past samples kept; the newest sample comes straight from the port.
    localparam integer HIST_LEN = 7;

    // Sample history: x_hist[0] = x[n-1], ..., x_hist[HIST_LEN-1] = x[n-7]
    reg signed [7:0] x_hist [0:HIST_LEN-1];

    integer k;

    // One folded tap pair: coeff * (newer + older).
    // The 16-bit return type sets the evaluation width, so the 8-bit operands
    // are sign-extended before the add and the add itself cannot overflow.
    function signed [15:0] sym_term;
        input signed [7:0] coeff;
        input signed [7:0] newer;
        input signed [7:0] older;
        begin
            sym_term = coeff * (newer + older);
        end
    endfunction

    always @(posedge clk) begin
        // Default for this edge; overridden below only when a sample is accepted.
        out_valid <= 1'b0;

        if (rst) begin
            for (k = 0; k < HIST_LEN; k = k + 1)
                x_hist[k] <= 8'sd0;
            sample_out <= 16'sd0;
        end else if (in_valid) begin
            // Age the history by one position and insert the new sample.
            for (k = HIST_LEN - 1; k >= 1; k = k - 1)
                x_hist[k] <= x_hist[k-1];
            x_hist[0] <= sample_in;

            // Non-blocking semantics: the terms below read the history as it
            // was before this edge, i.e. x[n-1]..x[n-7] relative to sample_in.
            sample_out <= sym_term(H0, sample_in, x_hist[6]) +
                          sym_term(H1, x_hist[0], x_hist[5]) +
                          sym_term(H2, x_hist[1], x_hist[4]) +
                          sym_term(H3, x_hist[2], x_hist[3]);

            out_valid <= 1'b1;
        end
    end

endmodule
