// avir/Documentation/a00017_source.html

#ifndef AVIR_CLANCIR_INCLUDED

#define AVIR_CLANCIR_INCLUDED


#include <cstring>

#include <cmath>


#if __cplusplus >= 201103L


    #include <cstdint>


#else // __cplusplus >= 201103L


    #include <stdint.h>


#endif // __cplusplus >= 201103L


// SIMD back-end selection.
//
// Exactly one branch is taken. Each branch includes the intrinsics header it
// needs, defines the back-end marker macro(s) (LANCIR_AVX, LANCIR_SSE2,
// LANCIR_NEON, LANCIR_ARM32, LANCIR_WASM), and defines LANCIR_ALIGN - the
// byte alignment applied to the internal float buffers. LANCIR_ALIGN equal
// to 4 selects the build without the vector macros.
//
// Note: a backslash line continuation must be immediately followed by the
// continued line; a blank line after `\` ends the directive.

#if defined( __AVX__ )

    #include <immintrin.h>

    #define LANCIR_AVX
    #define LANCIR_SSE2 // Some functions use SSE2; AVX has a higher priority.
    #define LANCIR_ALIGN 32

#elif defined( __aarch64__ ) || defined( __arm64__ ) || \
    defined( _M_ARM64 ) || defined( _M_ARM64EC )

    #if defined( _MSC_VER )
        #include <arm64_neon.h>

        #if _MSC_VER < 1925
            #define LANCIR_ARM32 // Do not use some newer NEON intrinsics.
        #endif // _MSC_VER < 1925
    #else // defined( _MSC_VER )
        #include <arm_neon.h>
    #endif // defined( _MSC_VER )

    #define LANCIR_NEON
    #define LANCIR_ALIGN 16

#elif defined( __ARM_NEON ) || defined( __ARM_NEON__ ) || defined( _M_ARM )

    #include <arm_neon.h>

    #define LANCIR_ARM32
    #define LANCIR_NEON
    #define LANCIR_ALIGN 16

#elif defined( __SSE2__ ) || defined( _M_AMD64 ) || \
    ( defined( _M_IX86_FP ) && _M_IX86_FP == 2 )

    #if defined( _MSC_VER )
        #include <intrin.h>
    #else // defined( _MSC_VER )
        #include <emmintrin.h>
    #endif // defined( _MSC_VER )

    #define LANCIR_SSE2
    #define LANCIR_ALIGN 16

#elif defined( __wasm_simd128__ )

    #include <wasm_simd128.h>

    #define LANCIR_WASM
    #define LANCIR_ALIGN 16

#else // WASM

    #define LANCIR_ALIGN 4

#endif // WASM


// Portable 4 x float vector layer.
//
// Each back-end maps the `lancvec_*` names onto its own intrinsics. Names a
// back-end leaves undefined (e.g. `lancvec_loadu` and `lancvec_storeu` on
// NEON and WASM, `lancvec_store32_hadd` on SSE2 and WASM) are expected to be
// supplied by generic fallbacks defined when LANCIR_ALIGN > 4.
//
// Every multi-line macro keeps its continuation lines contiguous: a blank
// line after a trailing backslash would end the definition. None of the
// macros ends in a semicolon, so all of them can be used as an ordinary
// expression statement on every back-end.

#if defined( LANCIR_SSE2 )

    #define lancvec_t __m128
    #define lancvec_const_splat( v ) _mm_set1_ps( v )
    #define lancvec_load( m ) _mm_load_ps( m )
    #define lancvec_loadu( m ) _mm_loadu_ps( m )
    #define lancvec_store( m, v ) _mm_store_ps( m, v )
    #define lancvec_storeu( m, v ) _mm_storeu_ps( m, v )
    #define lancvec_add( v1, v2 ) _mm_add_ps( v1, v2 )
    #define lancvec_mul( v1, v2 ) _mm_mul_ps( v1, v2 )
    #define lancvec_min( v1, v2 ) _mm_min_ps( v1, v2 )
    #define lancvec_max( v1, v2 ) _mm_max_ps( v1, v2 )
    #define lancvec_madd( va, v1, v2 ) _mm_add_ps( va, _mm_mul_ps( v1, v2 ))

    // Adds the high half of `vh` to the low lanes of `vl`.
    #define lancvec_addhl( vl, vh ) _mm_add_ps( vl, _mm_movehl_ps( vh, vh ))

    // Stores lane 0 + lane 1 of `v` as a single float.
    #define lancvec_store32_addhl( m, v ) \
        _mm_store_ss( m, _mm_add_ss( v, _mm_shuffle_ps( v, v, 1 )))

    // Stores two floats: the low half of `v` plus the high half of `v`.
    #define lancvec_store64_addhl( m, v ) \
        _mm_storel_pi( (__m64*) ( m ), lancvec_addhl( v, v ))

#elif defined( LANCIR_NEON )

    #define lancvec_t float32x4_t
    #define lancvec_const_splat( v ) vdupq_n_f32( v )
    #define lancvec_load( m ) vld1q_f32( m )
    #define lancvec_store( m, v ) vst1q_f32( m, v )
    #define lancvec_add( v1, v2 ) vaddq_f32( v1, v2 )
    #define lancvec_mul( v1, v2 ) vmulq_f32( v1, v2 )
    #define lancvec_min( v1, v2 ) vminq_f32( v1, v2 )
    #define lancvec_max( v1, v2 ) vmaxq_f32( v1, v2 )
    #define lancvec_madd( va, v1, v2 ) vmlaq_f32( va, v1, v2 )

    // Stores the sum of all four lanes of `v` as a single float.
    #if defined( LANCIR_ARM32 )
        #define lancvec_store32_hadd( m, v ) { \
            const float32x2_t v2 = vadd_f32( vget_high_f32( v ), \
                vget_low_f32( v )); \
            *( m ) = vget_lane_f32( v2, 0 ) + \
                vget_lane_f32( v2, 1 ); } (void) 0
    #else // defined( LANCIR_ARM32 )
        #define lancvec_store32_hadd( m, v ) *( m ) = vaddvq_f32( v )
    #endif // defined( LANCIR_ARM32 )

    // Stores two floats: the low half of `v` plus the high half of `v`.
    #define lancvec_store64_addhl( m, v ) \
        vst1_f32( m, vadd_f32( vget_high_f32( v ), vget_low_f32( v )))

#elif defined( LANCIR_WASM )

    #define lancvec_t v128_t
    #define lancvec_const_splat( v ) wasm_f32x4_const_splat( v )
    #define lancvec_load32_splat( m ) wasm_v128_load32_splat( m )
    #define lancvec_load( m ) wasm_v128_load( m )
    #define lancvec_store( m, v ) wasm_v128_store( m, v )
    #define lancvec_add( v1, v2 ) wasm_f32x4_add( v1, v2 )
    #define lancvec_mul( v1, v2 ) wasm_f32x4_mul( v1, v2 )
    #define lancvec_min( v1, v2 ) wasm_f32x4_min( v1, v2 )
    #define lancvec_max( v1, v2 ) wasm_f32x4_max( v1, v2 )
    #define lancvec_madd( va, v1, v2 ) wasm_f32x4_add( va, \
        wasm_f32x4_mul( v1, v2 ))

    // Adds the high half of `vh` to the low lanes of `vl`.
    #define lancvec_addhl( vl, vh ) wasm_f32x4_add( vl, \
        wasm_i32x4_shuffle( vh, vh, 6, 7, 2, 3 ))

    // Stores lane 0 + lane 1 of `v` as a single float.
    #define lancvec_store32_addhl( m, v ) \
        *( m ) = ( wasm_f32x4_extract_lane( v, 0 ) + \
            wasm_f32x4_extract_lane( v, 1 ))

    // Stores two floats: the low half of `v` plus the high half of `v`.
    #define lancvec_store64_addhl( m, v ) \
        wasm_v128_store64_lane( m, lancvec_addhl( v, v ), 0 )

#endif // defined( LANCIR_WASM )


// Generic fallbacks for `lancvec_*` operations that the selected SIMD
// back-end did not define itself. Only active in SIMD builds
// (LANCIR_ALIGN > 4).

#if LANCIR_ALIGN > 4

    // Broadcast of a single float read from memory.
    #if !defined( lancvec_load32_splat )
        #define lancvec_load32_splat( m ) lancvec_const_splat( *( m ))
    #endif // !defined( lancvec_load32_splat )

    // Back-ends without a distinct unaligned load/store reuse the regular
    // one.
    #if !defined( lancvec_loadu )
        #define lancvec_loadu( m ) lancvec_load( m )
    #endif // !defined( lancvec_loadu )

    #if !defined( lancvec_storeu )
        #define lancvec_storeu( m, v ) lancvec_store( m, v )
    #endif // !defined( lancvec_storeu )

    // Horizontal sum of all four lanes stored as one float: fold the high
    // half onto the low half, then add the two remaining lanes. The
    // continuation lines must stay contiguous (no blank line after `\`).
    #if !defined( lancvec_store32_hadd )
        #define lancvec_store32_hadd( m, v ) { \
            const lancvec_t v2 = lancvec_addhl( v, v ); \
            lancvec_store32_addhl( m, v2 ); } (void) 0
    #endif // !defined( lancvec_store32_hadd )

#endif // LANCIR_ALIGN > 4


namespace avir {


using std :: memcpy;

using std :: memset;

using std :: fabs;

using std :: floor;

using std :: ceil;

using std :: sin;

using std :: cos;

using std :: size_t;


// Language-level compatibility: C++11 and later import the pointer-sized
// integer types from `std`; older compilers get a `nullptr` substitute.
#if __cplusplus >= 201103L


    using std :: intptr_t;

    using std :: uintptr_t;


#else // __cplusplus >= 201103L


    // Workaround for pre-C++11 compilers. `nullptr` is a keyword, and not a

    // macro, but check if such workaround is already in place.


    #if !defined( nullptr )

        #define nullptr NULL

        // Marks that the `nullptr` macro was introduced by this header
        // (presumably so that it can be undefined later - not visible here).
        #define LANCIR_NULLPTR

    #endif // !defined( nullptr )


#endif // __cplusplus >= 201103L


/**
 * @brief Optional parameter set accepted by CLancIR::resizeImage().
 *
 * All fields are public and may be changed after construction.
 */

class CLancIRParams
{
public:
    /// Source scanline size, in elements (not bytes). Values below 1 make
    /// the resizer use `SrcWidth * ElCount`.
    int SrcSSize;

    /// Destination scanline size, in elements (not bytes). Values below 1
    /// make the resizer use `NewWidth * ElCount`.
    int NewSSize;

    /// Horizontal resizing step. 0 selects `SrcWidth / NewWidth`; for zero
    /// and positive values the resizer additionally shifts `ox` by
    /// `( kx - 1 ) * 0.5`. A negative value is used as `-kx`, without that
    /// shift.
    double kx;

    /// Vertical resizing step, same conventions as `kx` (relative to
    /// `SrcHeight / NewHeight` and `oy`).
    double ky;

    /// Horizontal start offset.
    double ox;

    /// Vertical start offset.
    double oy;

    /// Filter's `a` parameter; the resizer rejects values below 2.0.
    /// Constructed as 3.0.
    double la;

    /**
     * @brief Initializes the parameters; every argument defaults to 0,
     * `la` is always set to 3.0.
     *
     * @param aSrcSSize Initial SrcSSize.
     * @param aNewSSize Initial NewSSize.
     * @param akx Initial kx.
     * @param aky Initial ky.
     * @param aox Initial ox.
     * @param aoy Initial oy.
     */

    CLancIRParams( const int aSrcSSize = 0, const int aNewSSize = 0,
        const double akx = 0.0, const double aky = 0.0,
        const double aox = 0.0, const double aoy = 0.0 )
        : SrcSSize( aSrcSSize ), NewSSize( aNewSSize ),
        kx( akx ), ky( aky ),
        ox( aox ), oy( aoy ),
        la( 3.0 )
    {
    }
};


class CLancIR

{

private:

    CLancIR( const CLancIR& )

    {

        // Unsupported.

    }


    /**
     * Copy-assignment is unsupported: the operator is private, copies
     * nothing, and leaves the object as it was.
     *
     * @return Reference to this object.
     */

    CLancIR& operator = ( const CLancIR& )
    {
        // Unsupported.

        CLancIR& self = *this;

        return( self );
    }


public:

    /**
     * Creates a resizer without any buffers allocated; the buffers are
     * allocated (and later reused) by resizeImage().
     */

    CLancIR()
        : FltBuf0( nullptr )
        , FltBuf0Len( 0 )
        , FltBuf( nullptr ) // Was left indeterminate until the first resize.
        , spv0( nullptr )
        , spv0len( 0 )
        , spv( nullptr )
    {
    }


    /**
     * Releases both owned allocations. `FltBuf` and `spv` are aligned
     * pointers into these allocations, and are not released separately.
     */

    ~CLancIR()
    {
        delete[] spv0;
        delete[] FltBuf0;
    }


    /**
     * @brief Resizes an image: a batched vertical pass into an intermediate
     * `float` buffer, followed by a horizontal pass that produces the
     * output scanlines.
     *
     * The function uses, and may reallocate, the object's internal buffers
     * and cached filter/position tables.
     *
     * @param[in] SrcBuf Source image; must be non-null and must not equal
     * `NewBuf`.
     * @param SrcWidth Source width, in pixels; must be >= 0.
     * @param SrcHeight Source height, in pixels; must be >= 0. If either
     * source dimension is 0, the output scanlines are zero-filled.
     * @param[out] NewBuf Destination image; must be non-null.
     * @param NewWidth Destination width, in pixels; must be > 0.
     * @param NewHeight Destination height, in pixels; must be > 0.
     * @param ElCount Number of elements (channels) per pixel; must be in
     * the 1..4 range, since only 1- to 4-element code paths exist.
     * @param aParams Optional extended parameters; `nullptr` selects a
     * default-constructed CLancIRParams.
     * @return `NewHeight` on success, 0 if an argument is out of range.
     * @tparam Tin Source element type.
     * @tparam Tout Destination element type.
     */

    template< typename Tin, typename Tout >
    int resizeImage( const Tin* const SrcBuf, const int SrcWidth,
        const int SrcHeight, Tout* const NewBuf, const int NewWidth,
        const int NewHeight, const int ElCount,
        const CLancIRParams* const aParams = nullptr )
    {
        // `ElCount` is validated as well: the code below dispatches on
        // 1, 2, 3 and treats anything else as 4, while strides are derived
        // from `ElCount` itself.

        if(( SrcWidth < 0 ) | ( SrcHeight < 0 ) |
            ( NewWidth <= 0 ) | ( NewHeight <= 0 ) |
            ( ElCount < 1 ) | ( ElCount > 4 ) |
            ( SrcBuf == nullptr ) | ( NewBuf == nullptr ) |
            ( (const void*) SrcBuf == (const void*) NewBuf ))
        {
            return( 0 );
        }

        static const CLancIRParams DefParams;
        const CLancIRParams& Params = ( aParams != nullptr ?
            *aParams : DefParams );

        // Written as a negated `>=` so that a NaN `la` is rejected, too.

        if( !( Params.la >= 2.0 ))
        {
            return( 0 );
        }

        const int OutSLen = NewWidth * ElCount;
        const size_t NewScanlineSize = (size_t) ( Params.NewSSize < 1 ?
            OutSLen : Params.NewSSize );

        if(( SrcWidth == 0 ) | ( SrcHeight == 0 ))
        {
            // Empty source: produce an all-zero output image.

            Tout* op = NewBuf;
            int i;

            for( i = 0; i < NewHeight; i++ )
            {
                memset( op, 0, (size_t) OutSLen * sizeof( Tout ));
                op += NewScanlineSize;
            }

            return( NewHeight );
        }

        const size_t SrcScanlineSize = (size_t) ( Params.SrcSSize < 1 ?
            SrcWidth * ElCount : Params.SrcSSize );

        // Resolve resizing steps and offsets. A zero step means "derive
        // from the image dimensions"; a negative step is used as its
        // absolute value, without the offset adjustment.

        double ox = Params.ox;
        double oy = Params.oy;
        double kx;
        double ky;

        if( Params.kx >= 0.0 )
        {
            kx = ( Params.kx == 0.0 ?
                (double) SrcWidth / NewWidth : Params.kx );

            ox += ( kx - 1.0 ) * 0.5;
        }
        else
        {
            kx = -Params.kx;
        }

        if( Params.ky >= 0.0 )
        {
            ky = ( Params.ky == 0.0 ?
                (double) SrcHeight / NewHeight : Params.ky );

            oy += ( ky - 1.0 ) * 0.5;
        }
        else
        {
            ky = -Params.ky;
        }

        // Update filters; position tables that refer to rebuilt filters
        // are invalidated.

        if( rfv.update( Params.la, ky, ElCount ))
        {
            rsv.reset();
            rsh.reset();
        }

        CResizeFilters* rfh; // Pointer to resizing filters for horizontal
            // resizing, may equal to `rfv` if the same stepping is in use.

        if( kx == ky )
        {
            rfh = &rfv;
        }
        else
        {
            rfh = &rfh0;

            if( rfh0.update( Params.la, kx, ElCount ))
            {
                rsh.reset();
            }
        }

        rsv.update( SrcHeight, NewHeight, oy, rfv, spv );
        rsh.update( SrcWidth, NewWidth, ox, *rfh );

        // Calculate vertical progressive resizing's batch size. Progressive
        // batching is used to try to keep addressing within the cache
        // capacity. This technique definitely works well for single-threaded
        // resizing on most CPUs, but may not provide an additional benefit
        // for multi-threaded resizing, or in a system-wide high-load
        // situations.

        const size_t FltWidthE = (size_t) (( rsh.padl + SrcWidth +
            rsh.padr ) * ElCount );

        const double CacheSize = 5500000.0; // Tuned for various CPUs.
        const double OpSize = (double) SrcScanlineSize * SrcHeight *
            sizeof( Tin ) + (double) FltWidthE * NewHeight * sizeof( float );

        int BatchSize = (int) ( NewHeight * CacheSize / ( OpSize + 1.0 ));

        if( BatchSize < 8 )
        {
            BatchSize = 8;
        }

        if( BatchSize > NewHeight )
        {
            BatchSize = NewHeight;
        }

        // Allocate/resize intermediate buffers.

        const int svs = ( rsv.padl + SrcHeight + rsv.padr ) * ElCount;
        float* const pspv0 = spv0;
        reallocBuf( spv0, spv, spv0len, ( svs > OutSLen ? svs : OutSLen ));
        reallocBuf( FltBuf0, FltBuf, FltBuf0Len,
            FltWidthE * (size_t) BatchSize );

        if( spv0 != pspv0 )
        {
            // The scanline buffer has moved: rebase the addresses stored in
            // the vertical position table.

            rsv.updateSPO( rfv, spv );
        }

        // Prepare output-related constants.

        static const bool IsInFloat = ( (Tin) 0.25f != 0 );
        static const bool IsOutFloat = ( (Tout) 0.25f != 0 );
        static const bool IsUnityMul = ( IsInFloat && IsOutFloat ) ||
            ( IsInFloat == IsOutFloat && sizeof( Tin ) == sizeof( Tout ));

        const float Clamp = ( sizeof( Tout ) == 1 ? 255.0f : 65535.0f );
        const float OutMul = ( IsOutFloat ? 1.0f : Clamp ) /
            ( IsInFloat ? 1.0f : ( sizeof( Tin ) == 1 ? 255.0f : 65535.0f ));

        // Perform batched resizing.

        const CResizePos* rpv = rsv.pos;
        Tout* opn = NewBuf;
        int bl = NewHeight;

        while( bl > 0 )
        {
            const int bc = ( bl > BatchSize ? BatchSize : bl );

            int kl = rfv.KernelLen;
            const Tin* ip = SrcBuf;
            float* op = FltBuf + rsh.padl * ElCount;

            const int so = (int) rpv[ 0 ].so;
            float* const sp = spv + so * ElCount;

            int cc = (int) rpv[ bc - 1 ].so - so + kl; // Pixel copy count.
            int rl = 0; // Leftmost pixel's replication count.
            int rr = 0; // Rightmost pixel's replication count.

            const int socc = so + cc;
            const int spe = rsv.padl + SrcHeight;

            // Calculate scanline copying and padding parameters, depending on
            // the batch's size and its vertical offset.

            if( so < rsv.padl )
            {
                if( socc <= rsv.padl )
                {
                    rl = cc;
                    cc = 0;
                }
                else
                {
                    if( socc > spe )
                    {
                        rr = socc - spe;
                        cc -= rr;
                    }

                    rl = rsv.padl - so;
                    cc -= rl;
                }
            }
            else
            {
                if( so >= spe )
                {
                    rr = cc;
                    cc = 0;
                    ip += (size_t) SrcHeight * SrcScanlineSize;
                }
                else
                {
                    if( socc > spe )
                    {
                        rr = socc - spe;
                        cc -= rr;
                    }

                    ip += (size_t) ( so - rsv.padl ) * SrcScanlineSize;
                }
            }

            // Batched vertical resizing.

            int i;

            if( ElCount == 1 )
            {
                for( i = 0; i < SrcWidth; i++ )
                {
                    copyScanline1v( ip, SrcScanlineSize, sp, cc, rl, rr );
                    resize1< false >( nullptr, op, FltWidthE, rpv, kl, bc );
                    ip += 1;
                    op += 1;
                }
            }
            else
            if( ElCount == 2 )
            {
                for( i = 0; i < SrcWidth; i++ )
                {
                    copyScanline2v( ip, SrcScanlineSize, sp, cc, rl, rr );
                    resize2< false >( nullptr, op, FltWidthE, rpv, kl, bc );
                    ip += 2;
                    op += 2;
                }
            }
            else
            if( ElCount == 3 )
            {
                for( i = 0; i < SrcWidth; i++ )
                {
                    copyScanline3v( ip, SrcScanlineSize, sp, cc, rl, rr );
                    resize3< false >( nullptr, op, FltWidthE, rpv, kl, bc );
                    ip += 3;
                    op += 3;
                }
            }
            else // ElCount == 4
            {
                for( i = 0; i < SrcWidth; i++ )
                {
                    copyScanline4v( ip, SrcScanlineSize, sp, cc, rl, rr );
                    resize4< false >( nullptr, op, FltWidthE, rpv, kl, bc );
                    ip += 4;
                    op += 4;
                }
            }

            // Perform horizontal resizing batch, and produce final output.

            float* ipf = FltBuf;
            kl = rfh -> KernelLen;

            if( ElCount == 1 )
            {
                for( i = 0; i < bc; i++ )
                {
                    padScanline1h( ipf, rsh, SrcWidth );
                    resize1< true >( ipf, spv, 1, rsh.pos, kl, NewWidth );
                    outputScanline< IsOutFloat, IsUnityMul >( spv, opn,
                        OutSLen, Clamp, OutMul );

                    ipf += FltWidthE;
                    opn += NewScanlineSize;
                }
            }
            else
            if( ElCount == 2 )
            {
                for( i = 0; i < bc; i++ )
                {
                    padScanline2h( ipf, rsh, SrcWidth );
                    resize2< true >( ipf, spv, 2, rsh.pos, kl, NewWidth );
                    outputScanline< IsOutFloat, IsUnityMul >( spv, opn,
                        OutSLen, Clamp, OutMul );

                    ipf += FltWidthE;
                    opn += NewScanlineSize;
                }
            }
            else
            if( ElCount == 3 )
            {
                for( i = 0; i < bc; i++ )
                {
                    padScanline3h( ipf, rsh, SrcWidth );
                    resize3< true >( ipf, spv, 3, rsh.pos, kl, NewWidth );
                    outputScanline< IsOutFloat, IsUnityMul >( spv, opn,
                        OutSLen, Clamp, OutMul );

                    ipf += FltWidthE;
                    opn += NewScanlineSize;
                }
            }
            else // ElCount == 4
            {
                for( i = 0; i < bc; i++ )
                {
                    padScanline4h( ipf, rsh, SrcWidth );
                    resize4< true >( ipf, spv, 4, rsh.pos, kl, NewWidth );
                    outputScanline< IsOutFloat, IsUnityMul >( spv, opn,
                        OutSLen, Clamp, OutMul );

                    ipf += FltWidthE;
                    opn += NewScanlineSize;
                }
            }

            rpv += bc;
            bl -= bc;
        }

        return( NewHeight );
    }


    template< typename Tin, typename Tout >


    int resizeImage( const Tin* const SrcBuf, const int SrcWidth,

        const int SrcHeight, const int SrcSSize, Tout* const NewBuf,

        const int NewWidth, const int NewHeight, const int NewSSize,

        const int ElCount, const double kx0 = 0.0, const double ky0 = 0.0,

        double ox = 0.0, double oy = 0.0 )

    {

        const CLancIRParams Params( SrcSSize, NewSSize, kx0, ky0, ox, oy );


        return( resizeImage( SrcBuf, SrcWidth, SrcHeight, NewBuf, NewWidth,

            NewHeight, ElCount, &Params ));

    }


protected:

    // Raw allocation behind `FltBuf`; released in the destructor.
    float* FltBuf0;

    // Allocated length of `FltBuf0`, in elements.
    size_t FltBuf0Len;

    // Aligned pointer into `FltBuf0`: intermediate buffer that receives the
    // vertically-resized batch of scanlines.
    float* FltBuf;

    // Raw allocation behind `spv`; released in the destructor.
    float* spv0;

    // Allocated length of `spv0`, in elements.
    int spv0len;

    // Aligned pointer into `spv0`: scanline buffer used as the source of the
    // vertical pass, and as the destination of the horizontal pass.
    float* spv;


    /**
     * @brief Grows an aligned buffer, if needed. Existing content is not
     * preserved on reallocation.
     *
     * LANCIR_ALIGN extra elements are always added to the requested length,
     * which leaves room for aligning `buf` upwards.
     *
     * @param[in,out] buf0 Owning pointer to the raw allocation.
     * @param[out] buf Receives `buf0` rounded up to a LANCIR_ALIGN-byte
     * boundary; written only when a reallocation happens.
     * @param[in,out] len Current allocated length, in elements.
     * @param newlen Required usable length, in elements.
     * @tparam Tb Buffer element type.
     * @tparam Tl Length type.
     */

    template< typename Tb, typename Tl >
    static void reallocBuf( Tb*& buf0, Tb*& buf, Tl& len, Tl newlen )
    {
        const Tl alloclen = newlen + LANCIR_ALIGN;

        if( !( alloclen > len ))
        {
            return; // The current allocation is large enough.
        }

        if( buf0 != nullptr )
        {
            // Reset the state before `new`, which may throw.

            delete[] buf0;
            buf0 = nullptr;
            len = 0;
        }

        buf0 = new Tb[ alloclen ];
        len = alloclen;

        const uintptr_t amask = (uintptr_t) ( LANCIR_ALIGN - 1 );
        const uintptr_t addr = (uintptr_t) buf0 + amask;

        buf = (Tb*) ( addr & ~amask );
    }


    /**
     * @brief Grows a plain (non-aligned) buffer, if needed. Existing
     * content is not preserved on reallocation; the buffer never shrinks.
     *
     * @param[in,out] buf Owning pointer to the buffer.
     * @param[in,out] len Current allocated length, in elements.
     * @param newlen Required length, in elements.
     * @tparam Tb Buffer element type.
     * @tparam Tl Length type.
     */

    template< typename Tb, typename Tl >
    static void reallocBuf( Tb*& buf, Tl& len, const Tl newlen )
    {
        if( !( newlen > len ))
        {
            return; // The current allocation is large enough.
        }

        if( buf != nullptr )
        {
            // Reset the state before `new`, which may throw.

            delete[] buf;
            buf = nullptr;
            len = 0;
        }

        Tb* const nbuf = new Tb[ newlen ];

        buf = nbuf;
        len = newlen;
    }


    class CResizeScanline;


    class CResizeFilters

    {

        friend class CResizeScanline;


    public:

        int KernelLen;


        CResizeFilters()

            : Filters( nullptr )

            , FiltersLen( 0 )

            , la( 0.0 )

        {

            memset( Bufs0, 0, sizeof( Bufs0 ));

            memset( Bufs0Len, 0, sizeof( Bufs0Len ));

        }


        /**
         * Releases the filter lookup table and all filter storage buffers.
         */

        ~CResizeFilters()
        {
            delete[] Filters;

            int bi = BufCount;

            while( bi != 0 )
            {
                bi--;
                delete[] Bufs0[ bi ];
            }
        }


        /**
         * @brief Re-initializes the filter bank for a new set of parameters.
         *
         * Nothing is done if the parameters equal those the bank was last
         * built for. Otherwise the kernel geometry is recalculated and the
         * lookup table is cleared; the filters themselves are built on
         * demand by getFilter().
         *
         * @param la0 Filter's `a` parameter.
         * @param k0 Resizing step.
         * @param ElCount0 Number of elements per pixel.
         * @return `true` if the bank was reset, `false` if left as is.
         */

        bool update( const double la0, const double k0, const int ElCount0 )
        {
            const bool IsUnchanged = ( la0 == la && k0 == k &&
                ElCount0 == ElCount );

            if( IsUnchanged )
            {
                return( false );
            }

            // Steps above 1 lower the normalized frequency to 1/k0.

            double NormFreq = 1.0;

            if( !( k0 <= 1.0 ))
            {
                NormFreq = 1.0 / k0;
            }

            Freq = 3.1415926535897932 * NormFreq;
            FreqA = Freq / la0;

            Len2 = la0 / NormFreq;
            fl2 = (int) ceil( Len2 );
            KernelLen = 2 * fl2;

            #if LANCIR_ALIGN > 4

                // SIMD builds store every tap `ElRepl` times, and round the
                // stored kernel length up to a whole number of vectors.

                const int AlignMask =
                    (int) ( LANCIR_ALIGN / sizeof( float )) - 1;

                ElRepl = ElCount0;
                KernelLenA = ( KernelLen * ElRepl + AlignMask ) & ~AlignMask;

            #else // LANCIR_ALIGN > 4

                ElRepl = 1;
                KernelLenA = KernelLen;

            #endif // LANCIR_ALIGN > 4

            FracCount = 1000; // Enough for Lanczos implicit 8-bit precision.

            // Keep the bank marked as "not built" while the buffers are
            // being reallocated.

            la = 0.0;
            reallocBuf( Filters, FiltersLen, FracCount + 1 );

            memset( Filters, 0, (size_t) FiltersLen * sizeof( Filters[ 0 ]));

            setBuf( 0 );

            ElCount = ElCount0;
            k = k0;
            la = la0;

            return( true );
        }


        /**
         * @brief Returns the filter for the fractional position `x`,
         * building and caching it on first use.
         *
         * @param x Fractional position; it is quantized to
         * `(int) ( x * FracCount + 0.5 )`, which indexes the lookup table.
         * @return Pointer to the filter's taps (replicated `ElRepl` times
         * per tap when `ElRepl` is above 1).
         */

        const float* getFilter( const double x )
        {
            const int Frac = (int) ( x * FracCount + 0.5 );
            float*& slot = Filters[ Frac ];

            if( slot == nullptr )
            {
                // Take the next free place in the current storage buffer,
                // and switch to the next buffer when this one becomes full.

                float* const flt = Bufs[ CurBuf ] + CurBufFill * KernelLenA;
                slot = flt;

                if( ++CurBufFill == BufLen )
                {
                    setBuf( CurBuf + 1 );
                }

                makeFilterNorm( flt, 1.0 - (double) Frac / FracCount );

                if( ElRepl > 1 )
                {
                    replicateFilter( flt, KernelLen, ElRepl );
                }
            }

            return( slot );
        }


    protected:

        // Step of the main sine generator: pi multiplied by the normalized
        // frequency (set in update()).
        double Freq;

        // Step of the second (window) sine generator: `Freq` divided by the
        // `la` parameter.
        double FreqA;

        // Fractional half-length of the kernel: `la` divided by the
        // normalized frequency.
        double Len2;

        // `Len2` rounded up; `KernelLen` equals `fl2 * 2`.
        int fl2;

        // Number of fractional positions the lookup table resolves.
        int FracCount;

        // Stored length of one filter, in floats (includes tap replication
        // and alignment padding in SIMD builds).
        int KernelLenA;

        // Number of times each tap is stored in a row.
        int ElRepl;

        // Number of filter storage buffers.
        static const int BufCount = 4;

        // Number of filters held by one storage buffer.
        static const int BufLen = 256;

        // Raw allocations of the storage buffers.
        float* Bufs0[ BufCount ];

        // Allocated lengths of `Bufs0`, in elements.
        int Bufs0Len[ BufCount ];

        // Aligned pointers into `Bufs0`.
        float* Bufs[ BufCount ];

        // Index of the storage buffer currently being filled.
        int CurBuf;

        // Number of filters already placed into the current buffer.
        int CurBufFill;

        // Lookup table indexed by the quantized fractional position; an
        // entry is null until its filter is built.
        float** Filters;

        // Allocated length of `Filters`, in elements.
        int FiltersLen;

        // Parameters the bank was last built for (see update()).
        double la;

        double k;

        int ElCount;


        /**
         * @brief Makes storage buffer `bi` the current one, growing it to
         * hold `BufLen` filters of `KernelLenA` floats, and resets its fill
         * counter.
         *
         * @param bi Storage buffer index.
         */

        void setBuf( const int bi )
        {
            const int ReqLen = BufLen * KernelLenA;

            reallocBuf( Bufs0[ bi ], Bufs[ bi ], Bufs0Len[ bi ], ReqLen );

            CurBufFill = 0;
            CurBuf = bi;
        }


        /**
         * @brief Recursive sine-wave generator: produces sin( ph ),
         * sin( ph + si ), sin( ph + 2 * si ), ... using one multiplication
         * and one subtraction per sample.
         */

        class CSineGen
        {
        public:
            /**
             * @param si Phase increment per sample, in radians.
             * @param ph Phase of the first sample, in radians.
             */

            CSineGen( const double si, const double ph )
                : svalue1( sin( ph )), svalue2( sin( ph - si )),
                sincr( 2.0 * cos( si ))
            {
            }

            /**
             * @return The current sample; the generator then advances by
             * one phase increment.
             */

            double generate()
            {
                const double cur = svalue1;
                const double nxt = sincr * cur - svalue2;

                svalue2 = cur;
                svalue1 = nxt;

                return( cur );
            }

        private:
            double svalue1; ///< Sample to be returned next.
            double svalue2; ///< Previously returned sample.
            double sincr; ///< Recurrence multiplier, 2 * cos( si ).
        };


        /**
         * @brief Builds one filter kernel and normalizes it to a unity sum.
         *
         * Each tap at the position `ut = t + FracDelay` is the product of
         * two sine generators (stepping by `Freq` and by `FreqA`) divided
         * by `ut * ut`. Taps are written consecutively, starting at
         * `t = -fl2`; afterwards all written taps are scaled by the
         * reciprocal of their sum.
         *
         * @param[out] op Destination for the taps.
         * @param FracDelay Fractional delay; getFilter() passes
         * `1.0 - Frac / FracCount`.
         */
        void makeFilterNorm( float* op, const double FracDelay ) const

        {

            // Both generators start at the phase of the first tap position,
            // `t = -fl2`.
            CSineGen f( Freq, Freq * ( FracDelay - fl2 ));

            CSineGen fw( FreqA, FreqA * ( FracDelay - fl2 ));


            float* op0 = op;

            // Running sum of the stored taps, used for normalization.
            double s = 0.0;

            double ut;


            int t = -fl2;


            // The first tap lies beyond `-Len2`: store zero, but still
            // advance both generators to keep them in step with `t`.
            if( t + FracDelay < -Len2 )

            {

                f.generate();

                fw.generate();

                *op = 0;

                op++;

                t++;

            }


            // `FracDelay` within 2.3e-13 of 1.0 places the zero-argument tap
            // at `t = -1` (the loop below then stops one tap earlier);
            // within 2.3e-13 of 0.0 places it at `t = 0`.
            int IsZeroX = ( fabs( FracDelay - 1.0 ) < 2.3e-13 );

            int mt = 0 - IsZeroX;

            IsZeroX |= ( fabs( FracDelay ) < 2.3e-13 );


            // Taps before the central one.
            while( t < mt )

            {

                ut = t + FracDelay;

                *op = (float) ( f.generate() * fw.generate() / ( ut * ut ));

                s += *op;

                op++;

                t++;

            }


            // Central tap. At a zero argument the quotient is 0/0, so the
            // value `Freq * FreqA` is stored instead (the limit of
            // sin(Freq*x) * sin(FreqA*x) / (x*x) at x=0); the generators
            // are advanced without using their output.
            if( IsZeroX ) // t+FracDelay==0

            {

                *op = (float) ( Freq * FreqA );

                s += *op;

                f.generate();

                fw.generate();

            }

            else

            {

                ut = FracDelay; // t==0

                *op = (float) ( f.generate() * fw.generate() / ( ut * ut ));

                s += *op;

            }


            // Taps after the central one, except for the last tap.
            mt = fl2 - 2;


            while( t < mt )

            {

                op++;

                t++;

                ut = t + FracDelay;

                *op = (float) ( f.generate() * fw.generate() / ( ut * ut ));

                s += *op;

            }


            // The last tap: stored as zero if it lies beyond `Len2`.
            op++;

            ut = t + 1 + FracDelay;


            if( ut > Len2 )

            {

                *op = 0;

            }

            else

            {

                *op = (float) ( f.generate() * fw.generate() / ( ut * ut ));

                s += *op;

            }


            // Normalize: scale every written tap by `1 / s`; `t` is reused
            // as the count of the taps written.
            s = 1.0 / s;

            t = (int) ( op - op0 + 1 );


            while( t != 0 )

            {

                *op0 = (float) ( *op0 * s );

                op0++;

                t--;

            }

        }


        /**
         * @brief Expands a filter in place, so that every tap is stored
         * several times in a row.
         *
         * Processing runs from the last tap to the first one, so that taps
         * not yet read are never overwritten.
         *
         * @param[in,out] p Filter taps; must have room for the expanded
         * filter.
         * @param kl Number of taps in the source filter.
         * @param erp Replication count: 2, 3 or 4. Any value other than 2
         * or 3 writes 4 copies per tap.
         */

        static void replicateFilter( float* const p, const int kl,
            const int erp )
        {
            const int ncopy = ( erp == 2 || erp == 3 ? erp : 4 );
            const float* src = p + kl - 1;
            float* dst = p + ( kl - 1 ) * erp;
            int left;

            for( left = kl; left != 0; left-- )
            {
                const float v = *src;
                int j;

                for( j = 0; j < ncopy; j++ )
                {
                    dst[ j ] = v;
                }

                src--;
                dst -= ncopy;
            }
        }


    };


    // Resizing position: one entry per output pixel (or output scanline).
    struct CResizePos

    {

        // Filter to apply at this position, as returned by
        // CResizeFilters::getFilter().
        const float* flt;

        // Address of the first source element, stored as an integer: the
        // scanline buffer's address plus `so` multiplied by the pixel size
        // in bytes (`ElCount * sizeof( float )`).
        intptr_t spo;

        // Source position offset, in pixels.
        intptr_t so;

    };


    class CResizeScanline

    {

    public:

        int padl;

        int padr;

        CResizePos* pos;


        CResizeScanline()

            : pos( nullptr )

            , poslen( 0 )

            , SrcLen( 0 )

        {

        }


        ~CResizeScanline()

        {

            delete[] pos;

        }


        void reset()

        {

            SrcLen = 0;

        }


        void update( const int SrcLen0, const int DstLen0, const double o0,

            CResizeFilters& rf, float* const sp = nullptr )

        {

            if( SrcLen0 == SrcLen && DstLen0 == DstLen && o0 == o )

            {

                return;

            }


            const int fl2m1 = rf.fl2 - 1;

            padl = fl2m1 - (int) floor( o0 );


            if( padl < 0 )

            {

                padl = 0;

            }


            // Make sure `padr` and `pos` are in sync: calculate ending `pos`

            // offset in advance.


            const double k = rf.k;


            const int DstLen_m1 = DstLen0 - 1;

            const double oe = o0 + k * DstLen_m1;

            const int ie = (int) floor( oe );


            padr = ie + rf.fl2 + 1 - SrcLen0;


            if( padr < 0 )

            {

                padr = 0;

            }


            SrcLen = 0;

            reallocBuf( pos, poslen, DstLen0 );


            const intptr_t ElCountF = rf.ElCount * (intptr_t) sizeof( float );

            const int so = padl - fl2m1;

            CResizePos* rp = pos;

            intptr_t rpso;

            int i;


            for( i = 0; i < DstLen_m1; i++ )

            {

                const double ox = o0 + k * i;

                const int ix = (int) floor( ox );


                rp -> flt = rf.getFilter( ox - ix );

                rpso = so + ix;

                rp -> spo = (intptr_t) sp + rpso * ElCountF;

                rp -> so = rpso;

                rp++;

            }


            rp -> flt = rf.getFilter( oe - ie );

            rpso = so + ie;

            rp -> spo = (intptr_t) sp + rpso * ElCountF;

            rp -> so = rpso;


            SrcLen = SrcLen0;

            DstLen = DstLen0;

            o = o0;

        }


        void updateSPO( CResizeFilters& rf, float* const sp )

        {

            const intptr_t ElCountF = rf.ElCount * (intptr_t) sizeof( float );

            CResizePos* const rp = pos;

            int i;


            for( i = 0; i < DstLen; i++ )

            {

                rp[ i ].spo = (intptr_t) sp + rp[ i ].so * ElCountF;

            }

        }


    protected:

        int poslen;

        int SrcLen;

        int DstLen;

        double o;

    };


    // Filters for the vertical pass; also used for the horizontal pass when
    // both passes use the same step.
    CResizeFilters rfv;

    // Filters for the horizontal pass when its step differs from the
    // vertical one.
    CResizeFilters rfh0;

    // Vertical resizing positions and padding.
    CResizeScanline rsv;

    // Horizontal resizing positions and padding.
    CResizeScanline rsh;


    /**
     * @brief Gathers a strided run of 1-element pixels into a contiguous
     * `float` buffer, replicating the edge pixels.
     *
     * Output layout: `repl` copies of the first pixel, then `cc` pixels
     * read with a stride of `ipinc`, then `repr` copies of the pixel that
     * precedes the final read position.
     *
     * @param ip Source pointer.
     * @param ipinc Source stride, in elements.
     * @param[out] op Destination buffer.
     * @param cc Number of pixels to copy.
     * @param repl Leading replication count.
     * @param repr Trailing replication count.
     * @tparam T Source element type.
     */

    template< typename T >
    static void copyScanline1v( const T* ip, const size_t ipinc, float* op,
        int cc, int repl, int repr )
    {
        if( repl > 0 )
        {
            const float a0 = static_cast< float >( ip[ 0 ]);

            for( ; repl != 0; repl-- )
            {
                *op++ = a0;
            }
        }

        for( ; cc != 0; cc-- )
        {
            *op++ = static_cast< float >( ip[ 0 ]);
            ip += ipinc;
        }

        if( repr > 0 )
        {
            const T* const last = ip - ipinc;
            const float z0 = static_cast< float >( last[ 0 ]);

            for( ; repr != 0; repr-- )
            {
                *op++ = z0;
            }
        }
    }


    /**
     * @brief Gathers a strided run of 2-element pixels into a contiguous
     * `float` buffer, replicating the edge pixels.
     *
     * Output layout: `repl` copies of the first pixel, then `cc` pixels
     * read with a stride of `ipinc`, then `repr` copies of the pixel that
     * precedes the final read position.
     *
     * @param ip Source pointer.
     * @param ipinc Source stride, in elements.
     * @param[out] op Destination buffer.
     * @param cc Number of pixels to copy.
     * @param repl Leading replication count.
     * @param repr Trailing replication count.
     * @tparam T Source element type.
     */

    template< typename T >
    static void copyScanline2v( const T* ip, const size_t ipinc, float* op,
        int cc, int repl, int repr )
    {
        if( repl > 0 )
        {
            const float a0 = static_cast< float >( ip[ 0 ]);
            const float a1 = static_cast< float >( ip[ 1 ]);

            for( ; repl != 0; repl-- )
            {
                op[ 0 ] = a0;
                op[ 1 ] = a1;
                op += 2;
            }
        }

        for( ; cc != 0; cc-- )
        {
            op[ 0 ] = static_cast< float >( ip[ 0 ]);
            op[ 1 ] = static_cast< float >( ip[ 1 ]);
            op += 2;
            ip += ipinc;
        }

        if( repr > 0 )
        {
            const T* const last = ip - ipinc;
            const float z0 = static_cast< float >( last[ 0 ]);
            const float z1 = static_cast< float >( last[ 1 ]);

            for( ; repr != 0; repr-- )
            {
                op[ 0 ] = z0;
                op[ 1 ] = z1;
                op += 2;
            }
        }
    }


    /**
     * @brief Gathers a strided run of 3-element pixels into a contiguous
     * `float` buffer, replicating the edge pixels.
     *
     * Output layout: `repl` copies of the first pixel, then `cc` pixels
     * read with a stride of `ipinc`, then `repr` copies of the pixel that
     * precedes the final read position.
     *
     * @param ip Source pointer.
     * @param ipinc Source stride, in elements.
     * @param[out] op Destination buffer.
     * @param cc Number of pixels to copy.
     * @param repl Leading replication count.
     * @param repr Trailing replication count.
     * @tparam T Source element type.
     */

    template< typename T >
    static void copyScanline3v( const T* ip, const size_t ipinc, float* op,
        int cc, int repl, int repr )
    {
        if( repl > 0 )
        {
            const float a0 = static_cast< float >( ip[ 0 ]);
            const float a1 = static_cast< float >( ip[ 1 ]);
            const float a2 = static_cast< float >( ip[ 2 ]);

            for( ; repl != 0; repl-- )
            {
                op[ 0 ] = a0;
                op[ 1 ] = a1;
                op[ 2 ] = a2;
                op += 3;
            }
        }

        for( ; cc != 0; cc-- )
        {
            op[ 0 ] = static_cast< float >( ip[ 0 ]);
            op[ 1 ] = static_cast< float >( ip[ 1 ]);
            op[ 2 ] = static_cast< float >( ip[ 2 ]);
            op += 3;
            ip += ipinc;
        }

        if( repr > 0 )
        {
            const T* const last = ip - ipinc;
            const float z0 = static_cast< float >( last[ 0 ]);
            const float z1 = static_cast< float >( last[ 1 ]);
            const float z2 = static_cast< float >( last[ 2 ]);

            for( ; repr != 0; repr-- )
            {
                op[ 0 ] = z0;
                op[ 1 ] = z1;
                op[ 2 ] = z2;
                op += 3;
            }
        }
    }


    /**
     * Scanline copying function, for vertical resizing; variant for pixels
     * made of 4 elements. Source elements are converted to "float" while
     * being copied. The first source pixel is written "repl" times before
     * the copied pixels, and the last source pixel is written "repr" times
     * after them.
     *
     * @param ip Pointer to the first source pixel.
     * @param ipinc Increment applied to "ip" after each copied pixel, in
     * elements of type T.
     * @param[out] op Output buffer; 4 values are written per pixel.
     * @param cc Number of source pixels to copy.
     * @param repl Number of leading replications of the first pixel.
     * @param repr Number of trailing replications of the last copied pixel.
     * @tparam T Source element type.
     */

    template< typename T >
    static void copyScanline4v( const T* ip, const size_t ipinc, float* op,
        int cc, int repl, int repr )
    {
        if( repl > 0 )
        {
            // The source is read only when replication is requested.

            const float first0 = (float) ip[ 0 ];
            const float first1 = (float) ip[ 1 ];
            const float first2 = (float) ip[ 2 ];
            const float first3 = (float) ip[ 3 ];

            for( int k = 0; k < repl; k++ )
            {
                *op++ = first0;
                *op++ = first1;
                *op++ = first2;
                *op++ = first3;
            }
        }

        for( ; cc != 0; cc-- )
        {
            *op++ = (float) ip[ 0 ];
            *op++ = (float) ip[ 1 ];
            *op++ = (float) ip[ 2 ];
            *op++ = (float) ip[ 3 ];
            ip += ipinc;
        }

        if( repr > 0 )
        {
            // "ip" is one step past the last copied pixel at this point.

            const T* const last = ip - ipinc;
            const float last0 = (float) last[ 0 ];
            const float last1 = (float) last[ 1 ];
            const float last2 = (float) last[ 2 ];
            const float last3 = (float) last[ 3 ];

            for( int k = 0; k < repr; k++ )
            {
                *op++ = last0;
                *op++ = last1;
                *op++ = last2;
                *op++ = last3;
            }
        }
    }


    /**
     * Scanline padding function, for horizontal resizing; variant for
     * pixels made of 1 element. The buffer is laid out as "rs.padl" padding
     * pixels, then "l" scanline pixels, then "rs.padr" padding pixels. The
     * left padding is filled with the first scanline pixel, and the right
     * padding with the last one.
     *
     * @param[in,out] op Scanline buffer, starting at the left padding.
     * @param rs Scanline resizing object; only "padl" and "padr" are read.
     * @param l Number of scanline pixels located between the paddings.
     */

    static void padScanline1h( float* op, CResizeScanline& rs, const int l )
    {
        float* const body = op + rs.padl; // First scanline pixel.
        float* const tail = body + l; // Start of the right padding.

        const float lv = body[ 0 ];

        for( int k = 0; k < rs.padl; k++ )
        {
            op[ k ] = lv;
        }

        const float rv = tail[ -1 ];

        for( int k = 0; k < rs.padr; k++ )
        {
            tail[ k ] = rv;
        }
    }


    /**
     * Scanline padding function, for horizontal resizing; variant for
     * pixels made of 2 elements. The buffer is laid out as "rs.padl"
     * padding pixels, then "l" scanline pixels, then "rs.padr" padding
     * pixels. The left padding is filled with the first scanline pixel, and
     * the right padding with the last one.
     *
     * @param[in,out] op Scanline buffer, starting at the left padding.
     * @param rs Scanline resizing object; only "padl" and "padr" are read.
     * @param l Number of scanline pixels located between the paddings.
     */

    static void padScanline2h( float* op, CResizeScanline& rs, const int l )
    {
        const float* const body = op + rs.padl * 2; // First scanline pixel.
        const int bodylen = l * 2;
        const float* const bodyend = body + bodylen;

        const float lv0 = body[ 0 ];
        const float lv1 = body[ 1 ];
        float* w = op;

        for( int k = 0; k < rs.padl; k++ )
        {
            *w++ = lv0;
            *w++ = lv1;
        }

        w += bodylen; // Skip over the scanline pixels.

        const float rv0 = bodyend[ -2 ];
        const float rv1 = bodyend[ -1 ];

        for( int k = 0; k < rs.padr; k++ )
        {
            *w++ = rv0;
            *w++ = rv1;
        }
    }


    /**
     * Scanline padding function, for horizontal resizing; variant for
     * pixels made of 3 elements. The buffer is laid out as "rs.padl"
     * padding pixels, then "l" scanline pixels, then "rs.padr" padding
     * pixels. The left padding is filled with the first scanline pixel, and
     * the right padding with the last one.
     *
     * @param[in,out] op Scanline buffer, starting at the left padding.
     * @param rs Scanline resizing object; only "padl" and "padr" are read.
     * @param l Number of scanline pixels located between the paddings.
     */

    static void padScanline3h( float* op, CResizeScanline& rs, const int l )
    {
        const float* const body = op + rs.padl * 3; // First scanline pixel.
        const int bodylen = l * 3;
        const float* const bodyend = body + bodylen;

        const float lv0 = body[ 0 ];
        const float lv1 = body[ 1 ];
        const float lv2 = body[ 2 ];
        float* w = op;

        for( int k = 0; k < rs.padl; k++ )
        {
            *w++ = lv0;
            *w++ = lv1;
            *w++ = lv2;
        }

        w += bodylen; // Skip over the scanline pixels.

        const float rv0 = bodyend[ -3 ];
        const float rv1 = bodyend[ -2 ];
        const float rv2 = bodyend[ -1 ];

        for( int k = 0; k < rs.padr; k++ )
        {
            *w++ = rv0;
            *w++ = rv1;
            *w++ = rv2;
        }
    }


    /**
     * Scanline padding function, for horizontal resizing; variant for
     * pixels made of 4 elements. The buffer is laid out as "rs.padl"
     * padding pixels, then "l" scanline pixels, then "rs.padr" padding
     * pixels. The left padding is filled with the first scanline pixel, and
     * the right padding with the last one.
     *
     * @param[in,out] op Scanline buffer, starting at the left padding.
     * @param rs Scanline resizing object; only "padl" and "padr" are read.
     * @param l Number of scanline pixels located between the paddings.
     */

    static void padScanline4h( float* op, CResizeScanline& rs, const int l )
    {
        const float* const body = op + rs.padl * 4; // First scanline pixel.
        const int bodylen = l * 4;
        const float* const bodyend = body + bodylen;

        const float lv0 = body[ 0 ];
        const float lv1 = body[ 1 ];
        const float lv2 = body[ 2 ];
        const float lv3 = body[ 3 ];
        float* w = op;

        for( int k = 0; k < rs.padl; k++ )
        {
            *w++ = lv0;
            *w++ = lv1;
            *w++ = lv2;
            *w++ = lv3;
        }

        w += bodylen; // Skip over the scanline pixels.

        const float rv0 = bodyend[ -4 ];
        const float rv1 = bodyend[ -3 ];
        const float rv2 = bodyend[ -2 ];
        const float rv3 = bodyend[ -1 ];

        for( int k = 0; k < rs.padr; k++ )
        {
            *w++ = rv0;
            *w++ = rv1;
            *w++ = rv2;
            *w++ = rv3;
        }
    }


    /**
     * Rounds a value, and applies clamping. The value is first limited to
     * the [0; Clamp] range, then 0.5 is added, and the result is truncated
     * to an integer.
     *
     * @param v Value to round and clamp.
     * @param Clamp Upper clamping bound; the lower bound is always 0.
     * @return Rounded and clamped value.
     */

    static inline int roundclamp( const float v, const float Clamp )
    {
        float cv;

        if( v > Clamp )
        {
            cv = Clamp;
        }
        else
        if( v < 0.0f )
        {
            cv = 0.0f;
        }
        else
        {
            cv = v;
        }

        return( (int) ( cv + 0.5f ));
    }


    /**
     * Scanline output function. Converts "l" intermediate "float" values
     * into the output type T.
     *
     * If IsOutFloat is "true", values are cast to T directly (after an
     * optional multiplication by OutMul), without clamping or rounding.
     * Otherwise, values are (optionally) multiplied by OutMul, clamped to
     * the [0; Clamp] range, and rounded to an integer.
     *
     * In SIMD builds (LANCIR_ALIGN > 4), groups of 4 values are read via
     * lancvec_load(), which is an aligned load in the SSE2 build; "ip" is
     * thus expected to be aligned to LANCIR_ALIGN bytes. The output pointer
     * "op" is written with unaligned-safe stores only, so it may have any
     * alignment.
     *
     * @param ip Input (resized) scanline.
     * @param[out] op Output scanline.
     * @param l The number of elements to convert.
     * @param Clamp Upper clamping bound, used for integer output only.
     * @param OutMul Output multiplier, ignored if IsUnityMul is "true".
     * @tparam IsOutFloat "True" if the output type is floating-point.
     * @tparam IsUnityMul "True" if multiplication by OutMul is not needed.
     * @tparam T Output element type.
     */

    template< bool IsOutFloat, bool IsUnityMul, typename T >
    static void outputScanline( const float* ip, T* op, int l,
        const float Clamp, const float OutMul )
    {
        if( IsOutFloat )
        {
            if( IsUnityMul )
            {
                if( sizeof( op[ 0 ]) == sizeof( ip[ 0 ]))
                {
                    // Same element size: a plain byte copy suffices.

                    memcpy( op, ip, (size_t) l * sizeof( op[ 0 ]));
                }
                else
                {
                    int l4 = l >> 2;
                    l &= 3;

                    while( l4 != 0 )
                    {
                        op[ 0 ] = (T) ip[ 0 ];
                        op[ 1 ] = (T) ip[ 1 ];
                        op[ 2 ] = (T) ip[ 2 ];
                        op[ 3 ] = (T) ip[ 3 ];
                        ip += 4;
                        op += 4;
                        l4--;
                    }

                    while( l != 0 )
                    {
                        *op = (T) *ip;
                        ip++;
                        op++;
                        l--;
                    }
                }
            }
            else
            {
                int l4 = l >> 2;
                l &= 3;
                bool DoScalar = true;

                if( sizeof( op[ 0 ]) == sizeof( ip[ 0 ]))
                {
                #if LANCIR_ALIGN > 4

                    // Vectorized multiplication; the scalar 4-element loop
                    // below is skipped in this case.

                    DoScalar = false;
                    const lancvec_t om = lancvec_load32_splat( &OutMul );

                    while( l4 != 0 )
                    {
                        lancvec_storeu( (float*) op,
                            lancvec_mul( lancvec_load( ip ), om ));

                        ip += 4;
                        op += 4;
                        l4--;
                    }

                #endif // LANCIR_ALIGN > 4
                }

                if( DoScalar )
                {
                    while( l4 != 0 )
                    {
                        op[ 0 ] = (T) ( ip[ 0 ] * OutMul );
                        op[ 1 ] = (T) ( ip[ 1 ] * OutMul );
                        op[ 2 ] = (T) ( ip[ 2 ] * OutMul );
                        op[ 3 ] = (T) ( ip[ 3 ] * OutMul );
                        ip += 4;
                        op += 4;
                        l4--;
                    }
                }

                while( l != 0 )
                {
                    *op = (T) ( *ip * OutMul );
                    ip++;
                    op++;
                    l--;
                }
            }
        }
        else
        {
            int l4 = l >> 2;
            l &= 3;

        #if LANCIR_ALIGN > 4

            const lancvec_t minv = lancvec_const_splat( 0.0f );
            const lancvec_t maxv = lancvec_load32_splat( &Clamp );
            const lancvec_t om = lancvec_load32_splat( &OutMul );

            #if defined( LANCIR_SSE2 )
                // _mm_cvtps_epi32() rounds according to the current
                // rounding mode: switch to round-to-nearest for the
                // duration of the loops, and restore the mode afterwards.

                unsigned int prevrm = _MM_GET_ROUNDING_MODE();
                _MM_SET_ROUNDING_MODE( _MM_ROUND_NEAREST );
            #else // defined( LANCIR_SSE2 )
                // Rounding is done by adding 0.5 before conversion.

                const lancvec_t v05 = lancvec_const_splat( 0.5f );
            #endif // defined( LANCIR_SSE2 )

            if( sizeof( op[ 0 ]) == 4 )
            {
                while( l4 != 0 )
                {
                    const lancvec_t v = lancvec_load( ip );
                    const lancvec_t cv = lancvec_max( lancvec_min(
                        ( IsUnityMul ? v : lancvec_mul( v, om )),
                        maxv ), minv );

                #if defined( LANCIR_SSE2 )

                    _mm_storeu_si128( (__m128i*) op, _mm_cvtps_epi32( cv ));

                #elif defined( LANCIR_NEON )

                    vst1q_u32( (unsigned int*) op, vcvtq_u32_f32( vaddq_f32(
                        cv, v05 )));

                #elif defined( LANCIR_WASM )

                    wasm_v128_store( op, wasm_i32x4_trunc_sat_f32x4(
                        wasm_f32x4_add( cv, v05 )));

                #endif // defined( LANCIR_WASM )

                    ip += 4;
                    op += 4;
                    l4--;
                }
            }
            else
            if( sizeof( op[ 0 ]) == 2 )
            {
                while( l4 != 0 )
                {
                    const lancvec_t v = lancvec_load( ip );
                    const lancvec_t cv = lancvec_max( lancvec_min(
                        ( IsUnityMul ? v : lancvec_mul( v, om )),
                        maxv ), minv );

                #if defined( LANCIR_SSE2 )

                    // Gather the low 16 bits of each 32-bit lane into the
                    // lowest 64 bits of the vector.

                    const __m128i v32 = _mm_cvtps_epi32( cv );
                    const __m128i v16s = _mm_shufflehi_epi16(
                        _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 );

                    const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 );

                    __m128i tmp;
                    _mm_store_si128( &tmp, v16 );
                    memcpy( op, &tmp, 8 );

                #elif defined( LANCIR_NEON )

                    const uint32x4_t v32 = vcvtq_u32_f32(
                        vaddq_f32( cv, v05 ));

                    const uint16x4_t v16 = vmovn_u32( v32 );

                    vst1_u16( (unsigned short*) op, v16 );

                #elif defined( LANCIR_WASM )

                    const v128_t v32 = wasm_i32x4_trunc_sat_f32x4(
                        wasm_f32x4_add( cv, v05 ));

                    wasm_v128_store64_lane( op,
                        wasm_u16x8_narrow_i32x4( v32, v32 ), 0 );

                #endif // defined( LANCIR_WASM )

                    ip += 4;
                    op += 4;
                    l4--;
                }
            }
            else
            {
                // Remaining case; presumably 1-byte output elements, as
                // 4 packed bytes are stored per iteration.

                while( l4 != 0 )
                {
                    const lancvec_t v = lancvec_load( ip );
                    const lancvec_t cv = lancvec_max( lancvec_min(
                        ( IsUnityMul ? v : lancvec_mul( v, om )),
                        maxv ), minv );

                #if defined( LANCIR_SSE2 )

                    const __m128i v32 = _mm_cvtps_epi32( cv );
                    const __m128i v16s = _mm_shufflehi_epi16(
                        _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 );

                    const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 );
                    const __m128i v8 = _mm_packus_epi16( v16, v16 );

                    // Store via memcpy(): "op" is a byte buffer that may be
                    // unaligned for "int" access, and must not be written
                    // through an "int" lvalue.

                    const int v8i = _mm_cvtsi128_si32( v8 );
                    memcpy( op, &v8i, 4 );

                #elif defined( LANCIR_NEON )

                    const uint32x4_t v32 = vcvtq_u32_f32(
                        vaddq_f32( cv, v05 ));

                    const uint16x4_t v16 = vmovn_u32( v32 );
                    const uint8x8_t v8 = vmovn_u16( vcombine_u16( v16, v16 ));

                    // Store via memcpy(), for the same reason as in the
                    // SSE2 branch.

                    const uint32_t v8i = vget_lane_u32( (uint32x2_t) v8, 0 );
                    memcpy( op, &v8i, 4 );

                #elif defined( LANCIR_WASM )

                    const v128_t v32 = wasm_i32x4_trunc_sat_f32x4(
                        wasm_f32x4_add( cv, v05 ));

                    const v128_t v16 = wasm_u16x8_narrow_i32x4( v32, v32 );

                    wasm_v128_store32_lane( op,
                        wasm_u8x16_narrow_i16x8( v16, v16 ), 0 );

                #endif // defined( LANCIR_WASM )

                    ip += 4;
                    op += 4;
                    l4--;
                }
            }

            #if defined( LANCIR_SSE2 )
                _MM_SET_ROUNDING_MODE( prevrm );
            #endif // defined( LANCIR_SSE2 )

        #else // LANCIR_ALIGN > 4

            if( IsUnityMul )
            {
                while( l4 != 0 )
                {
                    op[ 0 ] = (T) roundclamp( ip[ 0 ], Clamp );
                    op[ 1 ] = (T) roundclamp( ip[ 1 ], Clamp );
                    op[ 2 ] = (T) roundclamp( ip[ 2 ], Clamp );
                    op[ 3 ] = (T) roundclamp( ip[ 3 ], Clamp );
                    ip += 4;
                    op += 4;
                    l4--;
                }
            }
            else
            {
                while( l4 != 0 )
                {
                    op[ 0 ] = (T) roundclamp( ip[ 0 ] * OutMul, Clamp );
                    op[ 1 ] = (T) roundclamp( ip[ 1 ] * OutMul, Clamp );
                    op[ 2 ] = (T) roundclamp( ip[ 2 ] * OutMul, Clamp );
                    op[ 3 ] = (T) roundclamp( ip[ 3 ] * OutMul, Clamp );
                    ip += 4;
                    op += 4;
                    l4--;
                }
            }

        #endif // LANCIR_ALIGN > 4

            // Scalar processing of the remaining 0 to 3 elements.

            if( IsUnityMul )
            {
                while( l != 0 )
                {
                    *op = (T) roundclamp( *ip, Clamp );
                    ip++;
                    op++;
                    l--;
                }
            }
            else
            {
                while( l != 0 )
                {
                    *op = (T) roundclamp( *ip * OutMul, Clamp );
                    ip++;
                    op++;
                    l--;
                }
            }
        }
    }


    // Scanline resize function prologue. Opens a loop over "DstLen"
    // consecutive CResizePos entries starting at "rp", and for each entry
    // sets up "flt" (taken from rp -> flt) and "ip" (the source address).
    // If "UseSP" is true, "ip" is "sp" advanced by rp -> spo, the addition
    // being done on the integer address value; otherwise rp -> spo itself is
    // converted to the source address. Relies on "rp", "DstLen", "sp", and
    // "UseSP" being in scope at the point of use. The loop is closed by
    // LANCIR_LF_POST.

    #define LANCIR_LF_PRE \

            const CResizePos* const rpe = rp + DstLen; \

            while( rp != rpe ) \

            { \

                const float* flt = rp -> flt; \

                const float* ip; \

                if( UseSP ) \

                { \

                    ip = (const float*) ( (intptr_t) sp + rp -> spo ); \

                } \

                else \

                { \

                    ip = (const float*) rp -> spo; \

                }


    // Scanline resize function epilogue. Advances the output pointer "op"
    // by "opinc" elements, moves "rp" to the next position entry, and closes
    // the loop opened by LANCIR_LF_PRE.

    #define LANCIR_LF_POST \

                op += opinc; \

                rp++; \

            }


    /**
     * Function performs scanline resizing; variant that writes 1 output
     * value per position. For each of the "DstLen" positions, a dot product
     * of the filter (rp -> flt) with the source values is computed, and
     * stored at "op", which is then advanced by "opinc".
     *
     * The filter is processed in groups of 4 taps. The first group is
     * computed before the accumulation loop, so kl >> 2 must be at least 1.
     * If "kl" is not a multiple of 4, exactly 2 additional taps are
     * processed (presumably "kl" is always even - confirm with the code
     * that builds the filters).
     *
     * @param sp Source scanline base, used only if UseSP is "true".
     * @param[out] op Output buffer.
     * @param opinc Output increment per position, in "float" elements.
     * @param rp Array of "DstLen" resizing positions.
     * @param kl Filter length, in taps.
     * @param DstLen The number of output positions to produce.
     * @tparam UseSP Selects how the source address is derived from
     * rp -> spo, see LANCIR_LF_PRE.
     */

    template< bool UseSP >


    static void resize1( const float* const sp, float* op, const size_t opinc,

        const CResizePos* rp, const int kl, const int DstLen )

    {

        const int ci = kl >> 2;


        if(( kl & 3 ) == 0 )

        {

            // Filter length is a multiple of 4: no tail processing needed.

            LANCIR_LF_PRE


            int c = ci;


        #if LANCIR_ALIGN > 4


            // Filter is read with aligned loads, source with unaligned ones.

            lancvec_t sum = lancvec_mul(

                lancvec_load( flt ), lancvec_loadu( ip ));


            while( --c != 0 )

            {

                flt += 4;

                ip += 4;

                sum = lancvec_madd( sum, lancvec_load( flt ),

                    lancvec_loadu( ip ));

            }


            lancvec_store32_hadd( op, sum );


        #else // LANCIR_ALIGN > 4


            // Four independent partial sums, combined at the end.

            float sum0 = flt[ 0 ] * ip[ 0 ];

            float sum1 = flt[ 1 ] * ip[ 1 ];

            float sum2 = flt[ 2 ] * ip[ 2 ];

            float sum3 = flt[ 3 ] * ip[ 3 ];


            while( --c != 0 )

            {

                flt += 4;

                ip += 4;

                sum0 += flt[ 0 ] * ip[ 0 ];

                sum1 += flt[ 1 ] * ip[ 1 ];

                sum2 += flt[ 2 ] * ip[ 2 ];

                sum3 += flt[ 3 ] * ip[ 3 ];

            }


            op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 );


        #endif // LANCIR_ALIGN > 4


            LANCIR_LF_POST

        }

        else

        {

            // Filter length is not a multiple of 4: 2 extra taps follow the
            // last full group.

            LANCIR_LF_PRE


            int c = ci;


        #if LANCIR_ALIGN > 4


            lancvec_t sum = lancvec_mul( lancvec_load( flt ),

                lancvec_loadu( ip ));


            while( --c != 0 )

            {

                flt += 4;

                ip += 4;

                sum = lancvec_madd( sum, lancvec_load( flt ),

                    lancvec_loadu( ip ));

            }


            #if defined( LANCIR_NEON )


                // Fold the 4 lanes into 2, then accumulate the 2 extra taps
                // located at offsets 4 and 5.

                float32x2_t sum2 = vadd_f32( vget_high_f32( sum ),

                    vget_low_f32( sum ));


                sum2 = vmla_f32( sum2, vld1_f32( flt + 4 ),

                    vld1_f32( ip + 4 ));


                #if defined( LANCIR_ARM32 )

                    op[ 0 ] = vget_lane_f32( sum2, 0 ) +

                        vget_lane_f32( sum2, 1 );

                #else // defined( LANCIR_ARM32 )

                    op[ 0 ] = vaddv_f32( sum2 );

                #endif // defined( LANCIR_ARM32 )


            #else // defined( LANCIR_NEON )


                // Products at offsets 2..5 are computed; the 2 extra taps
                // (offsets 4 and 5) occupy the upper half of "sum2".
                // NOTE(review): presumably lancvec_addhl() adds the upper
                // half of its second argument, so that offsets 2 and 3 are
                // not counted twice - confirm against the macro definition.

                const lancvec_t sum2 = lancvec_mul( lancvec_loadu( flt + 2 ),

                    lancvec_loadu( ip + 2 ));


                sum = lancvec_addhl( sum, sum );

                sum = lancvec_addhl( sum, sum2 );


                lancvec_store32_addhl( op, sum );


            #endif // defined( LANCIR_NEON )


        #else // LANCIR_ALIGN > 4


            float sum0 = flt[ 0 ] * ip[ 0 ];

            float sum1 = flt[ 1 ] * ip[ 1 ];

            float sum2 = flt[ 2 ] * ip[ 2 ];

            float sum3 = flt[ 3 ] * ip[ 3 ];


            while( --c != 0 )

            {

                flt += 4;

                ip += 4;

                sum0 += flt[ 0 ] * ip[ 0 ];

                sum1 += flt[ 1 ] * ip[ 1 ];

                sum2 += flt[ 2 ] * ip[ 2 ];

                sum3 += flt[ 3 ] * ip[ 3 ];

            }


            // "flt" and "ip" still point to the last full group here, hence
            // the extra taps are at offsets 4 and 5.

            op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 ) +

                flt[ 4 ] * ip[ 4 ] + flt[ 5 ] * ip[ 5 ];


        #endif // LANCIR_ALIGN > 4


            LANCIR_LF_POST

        }

    }


    /**
     * Function performs scanline resizing; variant that writes 2 output
     * values per position (source values of the two channels are
     * interleaved). For each of the "DstLen" positions, the filtered sums
     * are stored at op[ 0 ] and op[ 1 ], and "op" is advanced by "opinc".
     *
     * In SIMD builds (LANCIR_ALIGN > 4) the filter pointer advances in
     * lockstep with the source pointer (8 elements per iteration), and a
     * remainder of kl & 3 == 2 is handled by one extra 4-element step. In
     * the scalar build one filter value is applied to both channels of a
     * tap, with 2 taps processed per iteration.
     * NOTE(review): this implies that the filter storage layout differs
     * between SIMD and scalar builds - confirm against the filter
     * generation code.
     *
     * The first group is computed before the accumulation loop, so "ci"
     * must be at least 1.
     *
     * @param sp Source scanline base, used only if UseSP is "true".
     * @param[out] op Output buffer.
     * @param opinc Output increment per position, in "float" elements.
     * @param rp Array of "DstLen" resizing positions.
     * @param kl Filter length, in taps.
     * @param DstLen The number of output positions to produce.
     * @tparam UseSP Selects how the source address is derived from
     * rp -> spo, see LANCIR_LF_PRE.
     */

    template< bool UseSP >


    static void resize2( const float* const sp, float* op, const size_t opinc,

        const CResizePos* rp, const int kl, const int DstLen )

    {

    #if LANCIR_ALIGN > 4

        const int ci = kl >> 2;

        const int cir = kl & 3;

    #else // LANCIR_ALIGN > 4

        const int ci = kl >> 1;

    #endif // LANCIR_ALIGN > 4


        LANCIR_LF_PRE


        int c = ci;


    #if defined( LANCIR_AVX )


        // 8 elements per iteration in a single 256-bit accumulator.

        __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ),

            _mm256_loadu_ps( ip ));


        while( --c != 0 )

        {

            flt += 8;

            ip += 8;

            sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ),

                _mm256_loadu_ps( ip )));

        }


        // Fold the two 128-bit halves together.

        __m128 res = _mm_add_ps( _mm256_extractf128_ps( sum, 0 ),

            _mm256_extractf128_ps( sum, 1 ));


        if( cir == 2 )

        {

            res = _mm_add_ps( res, _mm_mul_ps( _mm_load_ps( flt + 8 ),

                _mm_loadu_ps( ip + 8 )));

        }


        // Add the upper pair of lanes to the lower pair, and store the 2
        // resulting values.

        _mm_storel_pi( (__m64*) op,

            _mm_add_ps( res, _mm_movehl_ps( res, res )));


    #elif LANCIR_ALIGN > 4


        // 8 elements per iteration in two 4-element accumulators.

        lancvec_t sumA = lancvec_mul(

            lancvec_load( flt ), lancvec_loadu( ip ));


        lancvec_t sumB = lancvec_mul(

            lancvec_load( flt + 4 ), lancvec_loadu( ip + 4 ));


        while( --c != 0 )

        {

            flt += 8;

            ip += 8;

            sumA = lancvec_madd( sumA, lancvec_load( flt ),

                lancvec_loadu( ip ));


            sumB = lancvec_madd( sumB, lancvec_load( flt + 4 ),

                lancvec_loadu( ip + 4 ));

        }


        sumA = lancvec_add( sumA, sumB );


        if( cir == 2 )

        {

            sumA = lancvec_madd( sumA, lancvec_load( flt + 8 ),

                lancvec_loadu( ip + 8 ));

        }


        lancvec_store64_addhl( op, sumA );


    #else // LANCIR_ALIGN > 4


        // 2 taps per iteration: sum0/sum1 accumulate the first tap's
        // channels, sum2/sum3 the second tap's channels.

        const float xx = flt[ 0 ];

        const float xx2 = flt[ 1 ];

        float sum0 = xx * ip[ 0 ];

        float sum1 = xx * ip[ 1 ];

        float sum2 = xx2 * ip[ 2 ];

        float sum3 = xx2 * ip[ 3 ];


        while( --c != 0 )

        {

            flt += 2;

            ip += 4;

            const float xx = flt[ 0 ];

            const float xx2 = flt[ 1 ];

            sum0 += xx * ip[ 0 ];

            sum1 += xx * ip[ 1 ];

            sum2 += xx2 * ip[ 2 ];

            sum3 += xx2 * ip[ 3 ];

        }


        op[ 0 ] = sum0 + sum2;

        op[ 1 ] = sum1 + sum3;


    #endif // LANCIR_ALIGN > 4


        LANCIR_LF_POST

    }


    /**
     * Function performs scanline resizing; variant that writes 3 output
     * values per position (source values of the three channels are
     * interleaved). For each of the "DstLen" positions, the filtered sums
     * are stored at op[ 0 ], op[ 1 ], and op[ 2 ], and "op" is advanced by
     * "opinc".
     *
     * In SIMD builds (LANCIR_ALIGN > 4), 12 elements are processed per
     * iteration, with the filter pointer advancing in lockstep with the
     * source pointer; element "i" of a 12-element group contributes to
     * output channel i % 3. A remainder of kl & 3 == 2 adds 6 more elements
     * (offsets 12 to 17). In the scalar build one filter value is applied
     * to all three channels of a tap, with 2 taps processed per iteration.
     * NOTE(review): this implies that the filter storage layout differs
     * between SIMD and scalar builds - confirm against the filter
     * generation code.
     *
     * The first group is computed before the accumulation loop, so "ci"
     * must be at least 1.
     *
     * @param sp Source scanline base, used only if UseSP is "true".
     * @param[out] op Output buffer.
     * @param opinc Output increment per position, in "float" elements.
     * @param rp Array of "DstLen" resizing positions.
     * @param kl Filter length, in taps.
     * @param DstLen The number of output positions to produce.
     * @tparam UseSP Selects how the source address is derived from
     * rp -> spo, see LANCIR_LF_PRE.
     */

    template< bool UseSP >


    static void resize3( const float* const sp, float* op, const size_t opinc,

        const CResizePos* rp, const int kl, const int DstLen )

    {

    #if LANCIR_ALIGN > 4


        const int ci = kl >> 2;

        const int cir = kl & 3;


        LANCIR_LF_PRE


        // Scratch storage for the 12 accumulated lanes.

        float res[ 12 ];

        int c = ci;


    #if defined( LANCIR_AVX )


        // Elements 0..3 use a 128-bit accumulator, elements 4..11 use a
        // 256-bit one; the latter are read with unaligned loads, since
        // "flt + 4" is offset from the aligned group start.

        __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));

        __m256 sumB = _mm256_mul_ps( _mm256_loadu_ps( flt + 4 ),

            _mm256_loadu_ps( ip + 4 ));


        while( --c != 0 )

        {

            flt += 12;

            ip += 12;

            sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ),

                _mm_loadu_ps( ip )));


            sumB = _mm256_add_ps( sumB, _mm256_mul_ps(

                _mm256_loadu_ps( flt + 4 ), _mm256_loadu_ps( ip + 4 )));

        }


        if( cir == 2 )

        {

            // Offsets 12..15 fall into the same lanes as elements 0..3.

            sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 12 ),

                _mm_loadu_ps( ip + 12 )));

        }


        _mm_storeu_ps( res, sumA );


        float o0 = res[ 0 ] + res[ 3 ];

        float o1 = res[ 1 ];

        float o2 = res[ 2 ];


        _mm256_storeu_ps( res + 4, sumB );


        o1 += res[ 4 ];

        o2 += res[ 5 ];


    #else // defined( LANCIR_AVX )


        // Three 4-element accumulators cover the 12-element group.

        lancvec_t sumA = lancvec_mul( lancvec_load( flt ),

            lancvec_loadu( ip ));


        lancvec_t sumB = lancvec_mul( lancvec_load( flt + 4 ),

            lancvec_loadu( ip + 4 ));


        lancvec_t sumC = lancvec_mul( lancvec_load( flt + 8 ),

            lancvec_loadu( ip + 8 ));


        while( --c != 0 )

        {

            flt += 12;

            ip += 12;

            sumA = lancvec_madd( sumA, lancvec_load( flt ),

                lancvec_loadu( ip ));


            sumB = lancvec_madd( sumB, lancvec_load( flt + 4 ),

                lancvec_loadu( ip + 4 ));


            sumC = lancvec_madd( sumC, lancvec_load( flt + 8 ),

                lancvec_loadu( ip + 8 ));

        }


        if( cir == 2 )

        {

            // Offsets 12..15 fall into the same lanes as elements 0..3.

            sumA = lancvec_madd( sumA, lancvec_load( flt + 12 ),

                lancvec_loadu( ip + 12 ));

        }


        lancvec_storeu( res, sumA );

        lancvec_storeu( res + 4, sumB );


        float o0 = res[ 0 ] + res[ 3 ];

        float o1 = res[ 1 ] + res[ 4 ];

        float o2 = res[ 2 ] + res[ 5 ];


        lancvec_storeu( res + 8, sumC );


    #endif // defined( LANCIR_AVX )


        // Remaining lanes 6..11, common to both SIMD variants.

        o0 += res[ 6 ] + res[ 9 ];

        o1 += res[ 7 ] + res[ 10 ];

        o2 += res[ 8 ] + res[ 11 ];


        if( cir == 2 )

        {

            // Offsets 16 and 17 complete the second extra tap (offset 15
            // was accumulated via lane 3 above).

            o1 += flt[ 16 ] * ip[ 16 ];

            o2 += flt[ 17 ] * ip[ 17 ];

        }


        op[ 0 ] = o0;

        op[ 1 ] = o1;

        op[ 2 ] = o2;


    #else // LANCIR_ALIGN > 4


        const int ci = kl >> 1;


        LANCIR_LF_PRE


        int c = ci;


        // 2 taps per iteration: sum0..sum2 accumulate the first tap's
        // channels, sum3..sum5 the second tap's channels.

        const float xx = flt[ 0 ];

        float sum0 = xx * ip[ 0 ];

        float sum1 = xx * ip[ 1 ];

        float sum2 = xx * ip[ 2 ];

        const float xx2 = flt[ 1 ];

        float sum3 = xx2 * ip[ 3 ];

        float sum4 = xx2 * ip[ 4 ];

        float sum5 = xx2 * ip[ 5 ];


        while( --c != 0 )

        {

            flt += 2;

            ip += 6;

            const float xx = flt[ 0 ];

            sum0 += xx * ip[ 0 ];

            sum1 += xx * ip[ 1 ];

            sum2 += xx * ip[ 2 ];

            const float xx2 = flt[ 1 ];

            sum3 += xx2 * ip[ 3 ];

            sum4 += xx2 * ip[ 4 ];

            sum5 += xx2 * ip[ 5 ];

        }


        op[ 0 ] = sum0 + sum3;

        op[ 1 ] = sum1 + sum4;

        op[ 2 ] = sum2 + sum5;


    #endif // LANCIR_ALIGN > 4


        LANCIR_LF_POST

    }


    /**
     * Function performs scanline resizing; variant that writes 4 output
     * values per position (source values of the four channels are
     * interleaved). For each of the "DstLen" positions, the filtered sums
     * are stored at op[ 0 ] to op[ 3 ], and "op" is advanced by "opinc".
     *
     * In SIMD builds (LANCIR_ALIGN > 4), 8 elements are processed per
     * iteration, with the filter pointer advancing in lockstep with the
     * source pointer. The non-AVX SIMD path reads the source via
     * lancvec_load() and stores the result via lancvec_store(), and the AVX
     * path stores via _mm_store_ps(); these are aligned accesses in the
     * SSE2/AVX builds, so the corresponding addresses are expected to be
     * suitably aligned there. In the scalar build one filter value is
     * applied to all four channels of a tap, one tap per iteration.
     * NOTE(review): this implies that the filter storage layout differs
     * between SIMD and scalar builds - confirm against the filter
     * generation code.
     *
     * The first group is computed before the accumulation loop, so "ci"
     * must be at least 1.
     *
     * @param sp Source scanline base, used only if UseSP is "true".
     * @param[out] op Output buffer.
     * @param opinc Output increment per position, in "float" elements.
     * @param rp Array of "DstLen" resizing positions.
     * @param kl Filter length, in taps.
     * @param DstLen The number of output positions to produce.
     * @tparam UseSP Selects how the source address is derived from
     * rp -> spo, see LANCIR_LF_PRE.
     */

    template< bool UseSP >


    static void resize4( const float* const sp, float* op, const size_t opinc,

        const CResizePos* rp, const int kl, const int DstLen )

    {

    #if LANCIR_ALIGN > 4

        const int ci = kl >> 1;

    #else // LANCIR_ALIGN > 4

        const int ci = kl;

    #endif // LANCIR_ALIGN > 4


        LANCIR_LF_PRE


        int c = ci;


    #if defined( LANCIR_AVX )


        // 8 elements per iteration in a single 256-bit accumulator.

        __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ),

            _mm256_loadu_ps( ip ));


        while( --c != 0 )

        {

            flt += 8;

            ip += 8;

            sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ),

                _mm256_loadu_ps( ip )));

        }


        // Fold the two 128-bit halves into the 4 output values.

        _mm_store_ps( op, _mm_add_ps( _mm256_extractf128_ps( sum, 0 ),

            _mm256_extractf128_ps( sum, 1 )));


    #elif LANCIR_ALIGN > 4


        // 8 elements per iteration in two 4-element accumulators.

        lancvec_t sumA = lancvec_mul( lancvec_load( flt ),

            lancvec_load( ip ));


        lancvec_t sumB = lancvec_mul( lancvec_load( flt + 4 ),

            lancvec_load( ip + 4 ));


        while( --c != 0 )

        {

            flt += 8;

            ip += 8;

            sumA = lancvec_madd( sumA, lancvec_load( flt ),

                lancvec_load( ip ));


            sumB = lancvec_madd( sumB, lancvec_load( flt + 4 ),

                lancvec_load( ip + 4 ));

        }


        lancvec_store( op, lancvec_add( sumA, sumB ));


    #else // LANCIR_ALIGN > 4


        // One tap per iteration, one accumulator per channel.

        const float xx = flt[ 0 ];

        float sum0 = xx * ip[ 0 ];

        float sum1 = xx * ip[ 1 ];

        float sum2 = xx * ip[ 2 ];

        float sum3 = xx * ip[ 3 ];


        while( --c != 0 )

        {

            flt++;

            ip += 4;

            const float xx = flt[ 0 ];

            sum0 += xx * ip[ 0 ];

            sum1 += xx * ip[ 1 ];

            sum2 += xx * ip[ 2 ];

            sum3 += xx * ip[ 3 ];

        }


        op[ 0 ] = sum0;

        op[ 1 ] = sum1;

        op[ 2 ] = sum2;

        op[ 3 ] = sum3;


    #endif // LANCIR_ALIGN > 4


        LANCIR_LF_POST

    }


    #undef LANCIR_LF_PRE

    #undef LANCIR_LF_POST

};


// Undefine the "lancvec_" SIMD helper macros that were defined earlier in
// this header.

#undef lancvec_t

#undef lancvec_const_splat

#undef lancvec_load32_splat

#undef lancvec_load

#undef lancvec_loadu

#undef lancvec_store

#undef lancvec_storeu

#undef lancvec_add

#undef lancvec_mul

#undef lancvec_min

#undef lancvec_max

#undef lancvec_madd

#undef lancvec_addhl

#undef lancvec_store32_addhl

#undef lancvec_store32_hadd

#undef lancvec_store64_addhl


#if defined( LANCIR_NULLPTR )

    #undef nullptr

    #undef LANCIR_NULLPTR

#endif // defined( LANCIR_NULLPTR )


} // namespace avir


#endif // AVIR_CLANCIR_INCLUDED

LANCIR_ALIGN
#define LANCIR_ALIGN
Address alignment (granularity) used by resizing functions, in bytes.
Definition lancir.h:127

LANCIR_LF_POST
#define LANCIR_LF_POST
Scanline resize function epilogue.
Definition lancir.h:2083

LANCIR_LF_PRE
#define LANCIR_LF_PRE
Scanline resize function prologue.
Definition lancir.h:2063

avir::CLancIRParams
LANCIR resizing parameters class.
Definition lancir.h:261

avir::CLancIRParams::CLancIRParams
CLancIRParams(const int aSrcSSize=0, const int aNewSSize=0, const double akx=0.0, const double aky=0.0, const double aox=0.0, const double aoy=0.0)
Default constructor, with optional arguments that correspond to class variables.
Definition lancir.h:295

avir::CLancIRParams::oy
double oy
Start Y pixel offset within the source image, can be negative. A positive offset moves the image to t...
Definition lancir.h:278

avir::CLancIRParams::ky
double ky
Resizing step - vertical. Same as kx.
Definition lancir.h:275

avir::CLancIRParams::la
double la
Lanczos window function's "a" parameter, greater than or equal to 2.0.
Definition lancir.h:280

avir::CLancIRParams::NewSSize
int NewSSize
Physical size of the destination scanline, in elements (not bytes). If this value is below 1,...
Definition lancir.h:266

avir::CLancIRParams::ox
double ox
Start X pixel offset within the source image, can be negative. A positive offset moves the image to t...
Definition lancir.h:276

avir::CLancIRParams::SrcSSize
int SrcSSize
Physical size of the source scanline, in elements (not bytes). If this value is below 1,...
Definition lancir.h:263

avir::CLancIRParams::kx
double kx
Resizing step - horizontal (one output pixel corresponds to k input pixels). A downsizing factor if g...
Definition lancir.h:269

avir::CLancIR::copyScanline1v
static void copyScanline1v(const T *ip, const size_t ipinc, float *op, int cc, int repl, int repr)
Scanline copying function, for vertical resizing.
Definition lancir.h:1407

avir::CLancIR::FltBuf
float * FltBuf
Address-aligned FltBuf0.
Definition lancir.h:760

avir::CLancIR::resizeImage
int resizeImage(const Tin *const SrcBuf, const int SrcWidth, const int SrcHeight, const int SrcSSize, Tout *const NewBuf, const int NewWidth, const int NewHeight, const int NewSSize, const int ElCount, const double kx0=0.0, const double ky0=0.0, double ox=0.0, double oy=0.0)
Legacy image resizing function.
Definition lancir.h:745

avir::CLancIR::resize3
static void resize3(const float *const sp, float *op, const size_t opinc, const CResizePos *rp, const int kl, const int DstLen)
Function performs scanline resizing. Variants for 1-4-channel images.
Definition lancir.h:2323

avir::CLancIR::resize2
static void resize2(const float *const sp, float *op, const size_t opinc, const CResizePos *rp, const int kl, const int DstLen)
Function performs scanline resizing. Variants for 1-4-channel images.
Definition lancir.h:2225

avir::CLancIR::rsv
CResizeScanline rsv
Vertical resize scanline.
Definition lancir.h:1386

avir::CLancIR::padScanline2h
static void padScanline2h(float *op, CResizeScanline &rs, const int l)
Scanline padding function, for horizontal resizing.
Definition lancir.h:1634

avir::CLancIR::resize4
static void resize4(const float *const sp, float *op, const size_t opinc, const CResizePos *rp, const int kl, const int DstLen)
Function performs scanline resizing. Variants for 1-4-channel images.
Definition lancir.h:2467

avir::CLancIR::roundclamp
static int roundclamp(const float v, const float Clamp)
Rounds a value, and applies clamping.
Definition lancir.h:1746

avir::CLancIR::rfh0
CResizeFilters rfh0
Resizing filters for horizontal resizing (may not be in use).
Definition lancir.h:1384

avir::CLancIR::spv0
float * spv0
Scanline buffer for vertical resizing, also used at the output stage.
Definition lancir.h:761

avir::CLancIR::outputScanline
static void outputScanline(const float *ip, T *op, int l, const float Clamp, const float OutMul)
Scanline output function.
Definition lancir.h:1773

avir::CLancIR::rfv
CResizeFilters rfv
Resizing filters for vertical resizing.
Definition lancir.h:1383

avir::CLancIR::copyScanline4v
static void copyScanline4v(const T *ip, const size_t ipinc, float *op, int cc, int repl, int repr)
Scanline copying function, for vertical resizing.
Definition lancir.h:1542

avir::CLancIR::padScanline1h
static void padScanline1h(float *op, CResizeScanline &rs, const int l)
Scanline padding function, for horizontal resizing.
Definition lancir.h:1611

avir::CLancIR::padScanline4h
static void padScanline4h(float *op, CResizeScanline &rs, const int l)
Scanline padding function, for horizontal resizing.
Definition lancir.h:1698

avir::CLancIR::FltBuf0
float * FltBuf0
Intermediate resizing buffer.
Definition lancir.h:758

avir::CLancIR::copyScanline2v
static void copyScanline2v(const T *ip, const size_t ipinc, float *op, int cc, int repl, int repr)
Scanline copying function, for vertical resizing.
Definition lancir.h:1447

avir::CLancIR::resizeImage
int resizeImage(const Tin *const SrcBuf, const int SrcWidth, const int SrcHeight, Tout *const NewBuf, const int NewWidth, const int NewHeight, const int ElCount, const CLancIRParams *const aParams=nullptr)
Function resizes an image.
Definition lancir.h:387

avir::CLancIR::FltBuf0Len
size_t FltBuf0Len
Length of FltBuf0.
Definition lancir.h:759

avir::CLancIR::spv0len
int spv0len
Length of spv0.
Definition lancir.h:763

avir::CLancIR::reallocBuf
static void reallocBuf(Tb *&buf, Tl &len, const Tl newlen)
Typed buffer reallocation function.
Definition lancir.h:818

avir::CLancIR::copyScanline3v
static void copyScanline3v(const T *ip, const size_t ipinc, float *op, int cc, int repl, int repr)
Scanline copying function, for vertical resizing.
Definition lancir.h:1492

avir::CLancIR::rsh
CResizeScanline rsh
Horizontal resize scanline.
Definition lancir.h:1387

avir::CLancIR::resize1
static void resize1(const float *const sp, float *op, const size_t opinc, const CResizePos *rp, const int kl, const int DstLen)
Function performs scanline resizing. Variants for 1-4-channel images.
Definition lancir.h:2103

avir::CLancIR::padScanline3h
static void padScanline3h(float *op, CResizeScanline &rs, const int l)
Scanline padding function, for horizontal resizing.
Definition lancir.h:1664

avir::CLancIR::spv
float * spv
Address-aligned spv0.
Definition lancir.h:764

avir::CLancIR::reallocBuf
static void reallocBuf(Tb *&buf0, Tb *&buf, Tl &len, Tl newlen)
Typed buffer reallocation function, with address alignment.
Definition lancir.h:783

avir::CLancIR::CResizeFilters
Class for fractional delay filter bank storage and calculation.
Definition lancir.h:841

avir::CLancIR::CResizeFilters::Bufs
float * Bufs[BufCount]
Address-aligned Bufs0.
Definition lancir.h:988

avir::CLancIR::CResizeFilters::CurBufFill
int CurBufFill
The number of fractional positions filled in the current filter buffer.
Definition lancir.h:990

avir::CLancIR::CResizeFilters::getFilter
const float * getFilter(const double x)
Filter acquisition function.
Definition lancir.h:940

avir::CLancIR::CResizeFilters::Len2
double Len2
Half resampling filter's length, unrounded.
Definition lancir.h:972

avir::CLancIR::CResizeFilters::k
double k
Current k.
Definition lancir.h:997

avir::CLancIR::CResizeFilters::CurBuf
int CurBuf
Filter buffer currently being filled.
Definition lancir.h:989

avir::CLancIR::CResizeFilters::ElCount
int ElCount
Current ElCount.
Definition lancir.h:998

avir::CLancIR::CResizeFilters::BufLen
static const int BufLen
The number of fractional filters a single buffer (filter batch) may contain. Both the BufLen and BufC...
Definition lancir.h:981

avir::CLancIR::CResizeFilters::ElRepl
int ElRepl
The number of repetitions of each filter tap.
Definition lancir.h:978

avir::CLancIR::CResizeFilters::update
bool update(const double la0, const double k0, const int ElCount0)
Function updates the filter bank.
Definition lancir.h:882

avir::CLancIR::CResizeFilters::BufCount
static const int BufCount
The maximal number of buffers (filter batches) that can be in use.
Definition lancir.h:979

avir::CLancIR::CResizeFilters::makeFilterNorm
void makeFilterNorm(float *op, const double FracDelay) const
Filter calculation function.
Definition lancir.h:1076

avir::CLancIR::CResizeFilters::setBuf
void setBuf(const int bi)
Current buffer (filter batch) repositioning function.
Definition lancir.h:1009

avir::CLancIR::CResizeFilters::KernelLenA
int KernelLenA
SIMD-aligned and replicated filter kernel's length.
Definition lancir.h:976

avir::CLancIR::CResizeFilters::replicateFilter
static void replicateFilter(float *const p, const int kl, const int erp)
Filter tap replication function, for SIMD operations.
Definition lancir.h:1171

avir::CLancIR::CResizeFilters::la
double la
Current la.
Definition lancir.h:996

avir::CLancIR::CResizeFilters::Filters
float ** Filters
Fractional delay filters for all positions. A particular pointer equals nullptr, if a filter for such...
Definition lancir.h:992

avir::CLancIR::CResizeFilters::FreqA
double FreqA
Circular frequency of the window function.
Definition lancir.h:971

avir::CLancIR::CResizeFilters::fl2
int fl2
Half resampling filter's length, integer.
Definition lancir.h:973

avir::CLancIR::CResizeFilters::Bufs0
float * Bufs0[BufCount]
Buffers that hold all filters, original.
Definition lancir.h:984

avir::CLancIR::CResizeFilters::FiltersLen
int FiltersLen
Allocated length of Filters, in elements.
Definition lancir.h:995

avir::CLancIR::CResizeFilters::Freq
double Freq
Circular frequency of the filter.
Definition lancir.h:970

avir::CLancIR::CResizeFilters::FracCount
int FracCount
The number of fractional positions for which filters can be created.
Definition lancir.h:974

avir::CLancIR::CResizeFilters::KernelLen
int KernelLen
Resampling filter kernel's length, taps. Available after the update() function call....
Definition lancir.h:845

avir::CLancIR::CResizeFilters::Bufs0Len
int Bufs0Len[BufCount]
Allocated lengths in Bufs0, in float elements.
Definition lancir.h:986

avir::CLancIR::CResizeFilters::CSineGen
Sine-wave signal generator class.
Definition lancir.h:1027

avir::CLancIR::CResizeFilters::CSineGen::CSineGen
CSineGen(const double si, const double ph)
Constructor initializes this sine-wave signal generator.
Definition lancir.h:1038

avir::CLancIR::CResizeFilters::CSineGen::generate
double generate()
Generates the next sine-wave sample, without biasing.
Definition lancir.h:1049

avir::CLancIR::CResizePos
Structure defines source scanline positions and filters for each destination pixel.
Definition lancir.h:1227

avir::CLancIR::CResizePos::so
intptr_t so
Offset within the source scanline, in pixels.
Definition lancir.h:1231

avir::CLancIR::CResizePos::spo
intptr_t spo
Source scanline's pixel offset, in bytes, or a direct pointer to the scanline buffer.
Definition lancir.h:1229

avir::CLancIR::CResizePos::flt
const float * flt
Fractional delay filter.
Definition lancir.h:1228

avir::CLancIR::CResizeScanline
Scanline resizing positions class.
Definition lancir.h:1243

avir::CLancIR::CResizeScanline::SrcLen
int SrcLen
Current SrcLen.
Definition lancir.h:1378

avir::CLancIR::CResizeScanline::poslen
int poslen
Allocated pos buffer's length.
Definition lancir.h:1377

avir::CLancIR::CResizeScanline::update
void update(const int SrcLen0, const int DstLen0, const double o0, CResizeFilters &rf, float *const sp=nullptr)
Scanline positions update function.
Definition lancir.h:1290

avir::CLancIR::CResizeScanline::padl
int padl
Left-padding (in pixels) required for source scanline.
Definition lancir.h:1245

avir::CLancIR::CResizeScanline::pos
CResizePos * pos
Source scanline positions (offsets) and filters for each destination pixel position.
Definition lancir.h:1247

avir::CLancIR::CResizeScanline::reset
void reset()
Object's reset function.
Definition lancir.h:1270

avir::CLancIR::CResizeScanline::padr
int padr
Right-padding (in pixels) required for source scanline.
Definition lancir.h:1246

avir::CLancIR::CResizeScanline::o
double o
Current o.
Definition lancir.h:1380

avir::CLancIR::CResizeScanline::updateSPO
void updateSPO(CResizeFilters &rf, float *const sp)
Scanline pixel offsets update function.
Definition lancir.h:1364

avir::CLancIR::CResizeScanline::DstLen
int DstLen
Current DstLen.
Definition lancir.h:1379