CLancIR( const CLancIR& ); // Copy construction is not supported.

CLancIR& operator = ( const CLancIR& ); // Assignment is not supported.
template< typename Tin, typename Tout >
int resizeImage( const Tin* const SrcBuf, const int SrcWidth,
    const int SrcHeight, Tout* const NewBuf, const int NewWidth,
    const int NewHeight, const int ElCount,
    const CLancIRParams* const aParams = NULL )
if(( SrcWidth < 0 ) | ( SrcHeight < 0 ) | ( NewWidth <= 0 ) |
    ( NewHeight <= 0 ) | ( SrcBuf == NULL ) | ( NewBuf == NULL ) |
    ( (const void*) SrcBuf == (const void*) NewBuf ))
const CLancIRParams& Params = ( aParams != NULL ? *aParams : DefParams );

if( Params.la < 2.0 ) // The Lanczos "a" parameter must be at least 2.

const int OutSLen = NewWidth * ElCount;
const size_t NewScanlineSize = ( Params.NewSSize < 1 ?
    OutSLen : Params.NewSSize );
if(( SrcWidth == 0 ) | ( SrcHeight == 0 ))
{
    for( i = 0; i < NewHeight; i++ )
    {
        memset( op, 0, OutSLen * sizeof( Tout ));
        op += NewScanlineSize;
    }
}

const size_t SrcScanlineSize = ( Params.SrcSSize < 1 ?
    SrcWidth * ElCount : Params.SrcSSize );

double ox = Params.ox;
double oy = Params.oy;
if( Params.kx >= 0.0 )
{
    kx = ( Params.kx == 0.0 ?
        (double) SrcWidth / NewWidth : Params.kx );

    ox += ( kx - 1.0 ) * 0.5;
}

if( Params.ky >= 0.0 )
{
    ky = ( Params.ky == 0.0 ?
        (double) SrcHeight / NewHeight : Params.ky );

    oy += ( ky - 1.0 ) * 0.5;
}
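// Illustrative sketch (not part of lancir.h): with the default Params.kx ==
// Params.ky == 0.0, the scale steps default to the plain size ratios, and the
// 0.5 * ( k - 1 ) term centers the filter on the source grid. The sizes below
// are assumptions chosen only for this standalone check of the arithmetic.
#include <cstdio>

int main()
{
    const int SrcWidth = 1000, NewWidth = 400;      // hypothetical sizes
    const double kx = (double) SrcWidth / NewWidth; // the Params.kx == 0.0 path
    const double ox = 0.0 + ( kx - 1.0 ) * 0.5;     // filter centering offset

    printf( "kx = %.3f, ox = %.3f\n", kx, ox );     // kx = 2.500, ox = 0.750
    return 0;
}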
if( rfv.update( Params.la, ky, ElCount ))

if( rfh0.update( Params.la, kx, ElCount ))

rsv.update( SrcHeight, NewHeight, oy, rfv, spv );
rsh.update( SrcWidth, NewWidth, ox, *rfh );
const size_t FltWidthE = ( rsh.padl + SrcWidth + rsh.padr ) * ElCount;

const double CacheSize = 5500000.0;
const double OpSize = (double) SrcScanlineSize * SrcHeight *
    sizeof( Tin ) + (double) FltWidthE * NewHeight * sizeof( float );

int BatchSize = (int) ( NewHeight * CacheSize / ( OpSize + 1.0 ));

if( BatchSize > NewHeight )
{
    BatchSize = NewHeight;
}
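// Illustrative sketch (not part of lancir.h): the batch-size heuristic keeps
// roughly CacheSize bytes of source data plus intermediate float rows in
// flight per batch. The image sizes and the padding guess below are
// assumptions for the example only.
#include <cstdio>

int main()
{
    const int SrcWidth = 1920, SrcHeight = 1080;    // hypothetical 8-bit RGB
    const int NewWidth = 960, NewHeight = 540, ElCount = 3;

    const double SrcScanlineSize = (double) SrcWidth * ElCount;
    const double FltWidthE = (double) ( SrcWidth + 8 ) * ElCount; // assumed padding
    const double CacheSize = 5500000.0;
    const double OpSize = SrcScanlineSize * SrcHeight * sizeof( unsigned char ) +
        FltWidthE * NewHeight * sizeof( float );

    int BatchSize = (int) ( NewHeight * CacheSize / ( OpSize + 1.0 ));

    if( BatchSize > NewHeight )
    {
        BatchSize = NewHeight;
    }

    printf( "BatchSize = %d of %d rows\n", BatchSize, NewHeight ); // ~158 rows
    return 0;
}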
const int svs = ( rsv.padl + SrcHeight + rsv.padr ) * ElCount;

float* const pspv0 = spv0;

static const bool IsInFloat = ( (Tin) 0.25f != 0 );
static const bool IsOutFloat = ( (Tout) 0.25f != 0 );
static const bool IsUnityMul = ( IsInFloat && IsOutFloat ) ||
    ( IsInFloat == IsOutFloat && sizeof( Tin ) == sizeof( Tout ));

const int Clamp = ( sizeof( Tout ) == 1 ? 255 : 65535 );
const float OutMul = ( IsOutFloat ? 1.0f : (float) Clamp ) /
    ( IsInFloat ? 1 : ( sizeof( Tin ) == 1 ? 255 : 65535 ));
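// Illustrative sketch (not part of lancir.h): what OutMul evaluates to for a
// few pixel-type combinations, following the expression above. The helper
// below is hypothetical and uses plain arithmetic only.
#include <cstdio>

static float outMulFor( bool IsInFloat, bool IsOutFloat, int InClamp, int OutClamp )
{
    return ( IsOutFloat ? 1.0f : (float) OutClamp ) /
        ( IsInFloat ? 1 : InClamp );
}

int main()
{
    printf( "uint8  -> uint8 : %g\n", outMulFor( false, false, 255, 255 ));   // 1
    printf( "uint8  -> float : %g\n", outMulFor( false, true, 255, 0 ));      // ~0.00392
    printf( "float  -> uint8 : %g\n", outMulFor( true, false, 0, 255 ));      // 255
    printf( "uint8  -> uint16: %g\n", outMulFor( false, false, 255, 65535 )); // 257
    return 0;
}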
const int bc = ( bl > BatchSize ? BatchSize : bl );

int kl = rfv.KernelLen;
const Tin* ip = SrcBuf;

const int so = (int) rpv[ 0 ].so;
float* const sp = spv + so * ElCount;

int cc = (int) rpv[ bc - 1 ].so - so + kl;

const int socc = so + cc;
const int spe = rsv.padl + SrcHeight;

if( socc <= rsv.padl )

ip += SrcHeight * SrcScanlineSize;

ip += ( so - rsv.padl ) * SrcScanlineSize;
// Vertical pass: copy source columns and resize them, dispatched by ElCount.

for( i = 0; i < SrcWidth; i++ ) // ElCount == 1

for( i = 0; i < SrcWidth; i++ ) // ElCount == 2
{
    copyScanline2v( ip, SrcScanlineSize, sp, cc, rl, rr );
    resize2< false >( NULL, op, FltWidthE, rpv, kl, bc );
}

for( i = 0; i < SrcWidth; i++ ) // ElCount == 3
{
    copyScanline3v( ip, SrcScanlineSize, sp, cc, rl, rr );
    resize3< false >( NULL, op, FltWidthE, rpv, kl, bc );
}

for( i = 0; i < SrcWidth; i++ ) // ElCount == 4
{
    copyScanline4v( ip, SrcScanlineSize, sp, cc, rl, rr );
    resize4< false >( NULL, op, FltWidthE, rpv, kl, bc );
}
// Horizontal pass over the current batch of vertically-resized rows.

kl = rfh -> KernelLen;

for( i = 0; i < bc; i++ ) // ElCount == 1
{
    // ... output conversion call, continued:
        OutSLen, Clamp, OutMul );

    opn += NewScanlineSize;
}

for( i = 0; i < bc; i++ ) // ElCount == 2
{
    padScanline2h( ipf, rsh, SrcWidth );
    resize2< true >( ipf, spv, 2, rsh.pos, kl, NewWidth );
    // ... output conversion call, continued:
        OutSLen, Clamp, OutMul );

    opn += NewScanlineSize;
}

for( i = 0; i < bc; i++ ) // ElCount == 3
{
    padScanline3h( ipf, rsh, SrcWidth );
    resize3< true >( ipf, spv, 3, rsh.pos, kl, NewWidth );
    // ... output conversion call, continued:
        OutSLen, Clamp, OutMul );

    opn += NewScanlineSize;
}

for( i = 0; i < bc; i++ ) // ElCount == 4
{
    padScanline4h( ipf, rsh, SrcWidth );
    resize4< true >( ipf, spv, 4, rsh.pos, kl, NewWidth );
    // ... output conversion call, continued:
        OutSLen, Clamp, OutMul );

    opn += NewScanlineSize;
}
template< typename Tin, typename Tout >
int resizeImage( const Tin* const SrcBuf, const int SrcWidth,
    const int SrcHeight, const int SrcSSize, Tout* const NewBuf,
    const int NewWidth, const int NewHeight, const int NewSSize,
    const int ElCount, const double kx0 = 0.0, const double ky0 = 0.0,
    double ox = 0.0, double oy = 0.0 )
{
    const CLancIRParams Params( SrcSSize, NewSSize, kx0, ky0, ox, oy );

    return( resizeImage( SrcBuf, SrcWidth, SrcHeight, NewBuf, NewWidth,
        NewHeight, ElCount, &Params ));
}
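// Usage sketch (not part of lancir.h): resizing an interleaved 8-bit RGB
// buffer with the convenience overload above. The header name, the
// avir::CLancIR spelling and the buffer sizes are assumptions; only the
// parameter order is taken from the overload shown.
#include "lancir.h" // assumed header name
#include <vector>

void exampleResize()
{
    const int sw = 640, sh = 480;   // hypothetical source size
    const int dw = 320, dh = 240;   // hypothetical destination size

    std::vector< unsigned char > src( (size_t) sw * sh * 3 );
    std::vector< unsigned char > dst( (size_t) dw * dh * 3 );

    avir::CLancIR ir;

    // SrcSSize == 0 and NewSSize == 0 select tightly-packed scanlines.
    ir.resizeImage( src.data(), sw, sh, 0, dst.data(), dw, dh, 0, 3 );
}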
template< typename Tb, typename Tl >
static void reallocBuf( Tb*& buf0, Tb*& buf, Tl& len, Tl newlen )
{
    // ...
    buf0 = new Tb[ newlen ];
    // ...
}

template< typename Tb, typename Tl >
static void reallocBuf( Tb*& buf, Tl& len, Tl newlen )
{
    // ...
    buf = new Tb[ newlen ];
    // ...
}

class CResizeScanline;

friend class CResizeScanline;
bool update( const double la0, const double k0, const int ElCount0 )

if( la0 == la && k0 == k && ElCount0 == ElCount )

const double NormFreq = ( k0 <= 1.0 ? 1.0 : 1.0 / k0 );
Freq = 3.1415926535897932 * NormFreq;
Len2 = la0 / NormFreq;

const int Frac = (int) ( x * FracCount + 0.5 );
// Recurrence-based sine generator: produces sin( ph ), sin( ph + si ),
// sin( ph + 2 * si ), ... via s[ n + 1 ] = 2 * cos( si ) * s[ n ] - s[ n - 1 ].
: svalue1( sin( ph ))
, svalue2( sin( ph - si ))
, sincr( 2.0 * cos( si ))

const double res = svalue1;

svalue1 = sincr * res - svalue2;
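// Illustrative sketch (not part of lancir.h): the oscillator above follows
// from sin( a + si ) + sin( a - si ) == 2 * cos( si ) * sin( a ). A quick
// standalone check of the recurrence against sin() directly; the start phase
// and step are arbitrary example values.
#include <math.h>
#include <stdio.h>

int main()
{
    const double ph = 0.3, si = 0.125;
    double svalue1 = sin( ph );
    double svalue2 = sin( ph - si );
    const double sincr = 2.0 * cos( si );

    double maxerr = 0.0;

    for( int n = 0; n < 100; n++ )
    {
        const double res = svalue1;         // equals sin( ph + n * si )
        maxerr = fmax( maxerr, fabs( res - sin( ph + n * si )));

        svalue1 = sincr * res - svalue2;    // advance the recurrence
        svalue2 = res;
    }

    printf( "max abs error = %g\n", maxerr ); // on the order of 1e-15
    return 0;
}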
if( t + FracDelay < -Len2 )

int IsZeroX = ( fabs( FracDelay - 1.0 ) < 2.3e-13 );
int mt = 0 - IsZeroX;
IsZeroX |= ( fabs( FracDelay ) < 2.3e-13 );

ut = t + 1 + FracDelay;

t = (int) ( op - op0 + 1 );

*op0 = (float) ( *op0 * s );

const float* ip = p + kl - 1;
float* op = p + ( kl - 1 ) * erp;

// The same coefficient fetch appears in each channel-count branch:
const float v = *ip;

const float v = *ip;

const float v = *ip;
class CResizeScanline

void update( const int SrcLen0, const int DstLen0, const double o0,
    CResizeFilters& rf, float* const sp )

const int fl2m1 = rf.fl2 - 1;
padl = fl2m1 - (int) floor( o0 );

const double k = rf.k;

const int DstLen_m1 = DstLen0 - 1;
const double oe = o0 + k * DstLen_m1;
const int ie = (int) floor( oe );

padr = ie + rf.fl2 + 1 - SrcLen0;

const intptr_t ElCountF = rf.ElCount * sizeof( float );
const int so = padl - fl2m1;

for( i = 0; i < DstLen_m1; i++ )
{
    const double ox = o0 + k * i;
    const int ix = (int) floor( ox );

    rp -> spo = (intptr_t) sp + rpso * ElCountF;
}

rp -> spo = (intptr_t) sp + rpso * ElCountF;
const intptr_t ElCountF = rf.ElCount * sizeof( float );

for( i = 0; i < DstLen; i++ )
{
    rp[ i ].spo = (intptr_t) sp + rp[ i ].so * ElCountF;
}
template< typename T >
static void copyScanline1v( const T* ip, const size_t ipinc, float* op,
    int cc, int repl, int repr )

v0 = (float) ip[ 0 ];

} while( --repl != 0 );

op[ 0 ] = (float) ip[ 0 ];

const T* const ipe = ip - ipinc;
v0 = (float) ipe[ 0 ];

} while( --repr != 0 );
template< typename T >
static void copyScanline2v( const T* ip, const size_t ipinc, float* op,
    int cc, int repl, int repr )

v0 = (float) ip[ 0 ];
v1 = (float) ip[ 1 ];

} while( --repl != 0 );

op[ 0 ] = (float) ip[ 0 ];
op[ 1 ] = (float) ip[ 1 ];

const T* const ipe = ip - ipinc;
v0 = (float) ipe[ 0 ];
v1 = (float) ipe[ 1 ];

} while( --repr != 0 );

template< typename T >
static void copyScanline3v( const T* ip, const size_t ipinc, float* op,
    int cc, int repl, int repr )

v0 = (float) ip[ 0 ];
v1 = (float) ip[ 1 ];
v2 = (float) ip[ 2 ];

} while( --repl != 0 );

op[ 0 ] = (float) ip[ 0 ];
op[ 1 ] = (float) ip[ 1 ];
op[ 2 ] = (float) ip[ 2 ];

const T* const ipe = ip - ipinc;
v0 = (float) ipe[ 0 ];
v1 = (float) ipe[ 1 ];
v2 = (float) ipe[ 2 ];

} while( --repr != 0 );

template< typename T >
static void copyScanline4v( const T* ip, const size_t ipinc, float* op,
    int cc, int repl, int repr )

float v0, v1, v2, v3;

v0 = (float) ip[ 0 ];
v1 = (float) ip[ 1 ];
v2 = (float) ip[ 2 ];
v3 = (float) ip[ 3 ];

} while( --repl != 0 );

op[ 0 ] = (float) ip[ 0 ];
op[ 1 ] = (float) ip[ 1 ];
op[ 2 ] = (float) ip[ 2 ];
op[ 3 ] = (float) ip[ 3 ];

const T* const ipe = ip - ipinc;
v0 = (float) ipe[ 0 ];
v1 = (float) ipe[ 1 ];
v2 = (float) ipe[ 2 ];
v3 = (float) ipe[ 3 ];

} while( --repr != 0 );
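// Illustrative sketch (not part of lancir.h): the copyScanline*v helpers read
// one source column and replicate its first and last pixels "repl" and "repr"
// times, so the vertical filter never reads outside the image. A minimal
// single-channel version of that idea, written independently of the real
// helpers:
#include <cstddef>
#include <vector>

static void padColumn( const unsigned char* ip, size_t ipinc, int rows,
    int repl, int repr, std::vector< float >& col )
{
    col.clear();

    for( int i = 0; i < repl; i++ )
    {
        col.push_back( (float) ip[ 0 ]);                    // replicate top pixel
    }

    for( int i = 0; i < rows; i++ )
    {
        col.push_back( (float) ip[ (size_t) i * ipinc ]);   // copy the column
    }

    const unsigned char last = ip[ (size_t) ( rows - 1 ) * ipinc ];

    for( int i = 0; i < repr; i++ )
    {
        col.push_back( (float) last );                      // replicate bottom pixel
    }
}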
const float* ip = op + rs.padl;

for( i = 0; i < rs.padl; i++ )

for( i = 0; i < rs.padr; i++ )

static void padScanline2h( float* op, CResizeScanline& rs, const int l )

const float* ip = op + rs.padl * 2;

for( i = 0; i < rs.padl; i++ )

const int lc = l * 2;

for( i = 0; i < rs.padr; i++ )

static void padScanline3h( float* op, CResizeScanline& rs, const int l )

const float* ip = op + rs.padl * 3;

for( i = 0; i < rs.padl; i++ )

const int lc = l * 3;

for( i = 0; i < rs.padr; i++ )

static void padScanline4h( float* op, CResizeScanline& rs, const int l )

const float* ip = op + rs.padl * 4;

for( i = 0; i < rs.padl; i++ )

const int lc = l * 4;

for( i = 0; i < rs.padr; i++ )
static int roundclamp( const float v, const int Clamp )
{
    const int vr = (int) ( v + 0.5f );

    return( vr > Clamp ? Clamp : vr );
}
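// Illustrative check (not part of lancir.h): the round-and-clamp used by the
// scalar output path. Note that this helper does not clamp negative inputs to
// zero; the SIMD output paths below handle the lower bound with an explicit
// min/max pair. The demo function is hypothetical and mirrors the code above.
#include <cstdio>

static int roundclampDemo( const float v, const int Clamp )
{
    const int vr = (int) ( v + 0.5f );
    return( vr > Clamp ? Clamp : vr );
}

int main()
{
    printf( "%d %d %d\n",
        roundclampDemo( 254.4f, 255 ),   // 254
        roundclampDemo( 254.6f, 255 ),   // 255
        roundclampDemo( 300.0f, 255 ));  // 255
    return 0;
}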
template< bool IsOutFloat, bool IsUnityMul, typename T >
static void copyOutput( const float* ip, T* op, const int l,
    const int Clamp, const float OutMul )
if( sizeof( op[ 0 ]) == sizeof( ip[ 0 ]))
{
    memcpy( op, ip, l * sizeof( op[ 0 ]));
}

op[ 0 ] = (T) ip[ 0 ];
op[ 1 ] = (T) ip[ 1 ];
op[ 2 ] = (T) ip[ 2 ];
op[ 3 ] = (T) ip[ 3 ];

bool DoScalar = true;

if( sizeof( op[ 0 ]) == sizeof( ip[ 0 ]))

#if defined( LANCIR_SSE2 )

const __m128 om = _mm_set1_ps( OutMul );

_mm_storeu_ps( (float*) op, _mm_mul_ps( _mm_load_ps( ip ), om ));

#elif defined( LANCIR_NEON )

const float32x4_t om = vdupq_n_f32( OutMul );

vst1q_f32( (float*) op, vmulq_f32( vld1q_f32( ip ), om ));

#endif

op[ 0 ] = (T) ( ip[ 0 ] * OutMul );
op[ 1 ] = (T) ( ip[ 1 ] * OutMul );
op[ 2 ] = (T) ( ip[ 2 ] * OutMul );
op[ 3 ] = (T) ( ip[ 3 ] * OutMul );

*op = (T) ( *ip * OutMul );
#if defined( LANCIR_SSE2 )

const __m128 minv = _mm_setzero_ps();
const __m128 maxv = _mm_set1_ps( (float) Clamp );
const __m128 om = _mm_set1_ps( OutMul );

unsigned int prevrm = _MM_GET_ROUNDING_MODE();
_MM_SET_ROUNDING_MODE( _MM_ROUND_NEAREST );

if( sizeof( op[ 0 ]) == 4 )
{
    const __m128 v = _mm_load_ps( ip );
    const __m128 cv = _mm_max_ps( _mm_min_ps(
        ( IsUnityMul ? v : _mm_mul_ps( v, om )), maxv ), minv );

    _mm_storeu_si128( (__m128i*) op, _mm_cvtps_epi32( cv ));
}

if( sizeof( op[ 0 ]) == 2 )
{
    const __m128 v = _mm_load_ps( ip );
    const __m128 cv = _mm_max_ps( _mm_min_ps(
        ( IsUnityMul ? v : _mm_mul_ps( v, om )), maxv ), minv );

    const __m128i v32 = _mm_cvtps_epi32( cv );

    // Shuffle index 0 | 2 << 2 picks elements 0 and 2: this gathers the low
    // 16 bits of each 32-bit lane into the low 64 bits of the register.
    const __m128i v16s = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 );

    const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 );

    _mm_store_si128( &tmp, v16 );
    memcpy( op, &tmp, 8 );
}

// 8-bit output:
{
    const __m128 v = _mm_load_ps( ip );
    const __m128 cv = _mm_max_ps( _mm_min_ps(
        ( IsUnityMul ? v : _mm_mul_ps( v, om )), maxv ), minv );

    const __m128i v32 = _mm_cvtps_epi32( cv );
    const __m128i v16s = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16( v32, 0 | 2 << 2 ), 0 | 2 << 2 );

    const __m128i v16 = _mm_shuffle_epi32( v16s, 0 | 2 << 2 );
    const __m128i v8 = _mm_packus_epi16( v16, v16 );

    *(int*) op = _mm_cvtsi128_si32( v8 );
}

_MM_SET_ROUNDING_MODE( prevrm );
#elif defined( LANCIR_NEON )

const float32x4_t minv = vdupq_n_f32( 0.0f );
const float32x4_t maxv = vdupq_n_f32( (float) Clamp );
const float32x4_t om = vdupq_n_f32( OutMul );
const float32x4_t v05 = vdupq_n_f32( 0.5f );

if( sizeof( op[ 0 ]) == 4 )
{
    const float32x4_t v = vld1q_f32( ip );
    const float32x4_t cv = vmaxq_f32( vminq_f32(
        ( IsUnityMul ? v : vmulq_f32( v, om )), maxv ), minv );

    vst1q_u32( (unsigned int*) op, vcvtq_u32_f32( vaddq_f32(
        cv, v05 )));
}

if( sizeof( op[ 0 ]) == 2 )
{
    const float32x4_t v = vld1q_f32( ip );
    const float32x4_t cv = vmaxq_f32( vminq_f32(
        ( IsUnityMul ? v : vmulq_f32( v, om )), maxv ), minv );

    const uint32x4_t v32 = vcvtq_u32_f32( vaddq_f32( cv, v05 ));
    const uint16x4_t v16 = vmovn_u32( v32 );

    vst1_u16( (unsigned short*) op, v16 );
}

// 8-bit output:
{
    const float32x4_t v = vld1q_f32( ip );
    const float32x4_t cv = vmaxq_f32( vminq_f32(
        ( IsUnityMul ? v : vmulq_f32( v, om )), maxv ), minv );

    const uint32x4_t v32 = vcvtq_u32_f32( vaddq_f32( cv, v05 ));
    const uint16x4_t v16 = vmovn_u32( v32 );
    const uint8x8_t v8 = vmovn_u16( vcombine_u16( v16, v16 ));

    *(unsigned int*) op = vget_lane_u32( (uint32x2_t) v8, 0 );
}

// Scalar conversion path:

op[ 0 ] = (T) roundclamp( ip[ 0 ] * OutMul, Clamp );
op[ 1 ] = (T) roundclamp( ip[ 1 ] * OutMul, Clamp );
op[ 2 ] = (T) roundclamp( ip[ 2 ] * OutMul, Clamp );
op[ 3 ] = (T) roundclamp( ip[ 3 ] * OutMul, Clamp );
#define LANCIR_LF_PRE \
    const CResizePos* const rpe = rp + DstLen; \
    while( rp != rpe ) \
    const float* flt = rp -> flt; \
    ip = (const float*) ( (intptr_t) sp + rp -> spo ); \
    ip = (const float*) rp -> spo; \

#define LANCIR_LF_POST \
template< bool UseSP >
static void resize1( const float* const sp, float* op, const size_t opinc,
    const CResizePos* rp, const int kl, const int DstLen )
{
    const int ci = kl >> 2;

    if(( kl & 3 ) == 0 )

#if defined( LANCIR_SSE2 )

    __m128 sum = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));

    sum = _mm_add_ps( sum, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_loadu_ps( ip )));

    sum = _mm_add_ps( sum, _mm_movehl_ps( sum, sum ));

    _mm_store_ss( op, _mm_add_ss( sum,
        _mm_shuffle_ps( sum, sum, 1 )));

#elif defined( LANCIR_NEON )

    float32x4_t sum = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip ));

    sum = vmlaq_f32( sum, vld1q_f32( flt ), vld1q_f32( ip ));

#if defined( LANCIR_ARM32 )
    const float32x2_t sum2 = vadd_f32( vget_high_f32( sum ),
        vget_low_f32( sum ));

    op[ 0 ] = vget_lane_f32( sum2, 0 ) + vget_lane_f32( sum2, 1 );

    op[ 0 ] = vaddvq_f32( sum ); // non-ARM32 (AArch64) path

    // Scalar fallback:

    float sum0 = flt[ 0 ] * ip[ 0 ];
    float sum1 = flt[ 1 ] * ip[ 1 ];
    float sum2 = flt[ 2 ] * ip[ 2 ];
    float sum3 = flt[ 3 ] * ip[ 3 ];

    sum0 += flt[ 0 ] * ip[ 0 ];
    sum1 += flt[ 1 ] * ip[ 1 ];
    sum2 += flt[ 2 ] * ip[ 2 ];
    sum3 += flt[ 3 ] * ip[ 3 ];

    op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 );

    // Variant for kernel lengths that are not a multiple of 4 (remainder 2):

#if defined( LANCIR_SSE2 )

    __m128 sum = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));

    sum = _mm_add_ps( sum, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_loadu_ps( ip )));

    sum = _mm_add_ps( sum, _mm_movehl_ps( sum, sum ));

    const __m128 sum2 = _mm_mul_ps( _mm_loadu_ps( flt + 2 ),
        _mm_loadu_ps( ip + 2 ));

    sum = _mm_add_ps( sum, _mm_movehl_ps( sum2, sum2 ));

    _mm_store_ss( op, _mm_add_ss( sum,
        _mm_shuffle_ps( sum, sum, 1 )));

#elif defined( LANCIR_NEON )

    float32x4_t sum = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip ));

    sum = vmlaq_f32( sum, vld1q_f32( flt ), vld1q_f32( ip ));

    float32x2_t sum2 = vadd_f32( vget_high_f32( sum ),
        vget_low_f32( sum ));

    sum2 = vmla_f32( sum2, vld1_f32( flt + 4 ), vld1_f32( ip + 4 ));

#if defined( LANCIR_ARM32 )
    op[ 0 ] = vget_lane_f32( sum2, 0 ) + vget_lane_f32( sum2, 1 );

    op[ 0 ] = vaddv_f32( sum2 ); // non-ARM32 (AArch64) path

    // Scalar fallback:

    float sum0 = flt[ 0 ] * ip[ 0 ];
    float sum1 = flt[ 1 ] * ip[ 1 ];
    float sum2 = flt[ 2 ] * ip[ 2 ];
    float sum3 = flt[ 3 ] * ip[ 3 ];

    sum0 += flt[ 0 ] * ip[ 0 ];
    sum1 += flt[ 1 ] * ip[ 1 ];
    sum2 += flt[ 2 ] * ip[ 2 ];
    sum3 += flt[ 3 ] * ip[ 3 ];

    op[ 0 ] = ( sum0 + sum1 ) + ( sum2 + sum3 ) +
        flt[ 4 ] * ip[ 4 ] + flt[ 5 ] * ip[ 5 ];
template< bool UseSP >
static void resize2( const float* const sp, float* op, const size_t opinc,
    const CResizePos* rp, const int kl, const int DstLen )
{
#if LANCIR_ALIGN > 4
    const int ci = kl >> 2;
    const int cir = kl & 3;
#else // LANCIR_ALIGN > 4
    const int ci = kl >> 1;
#endif // LANCIR_ALIGN > 4

#if defined( LANCIR_AVX )

    __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ),
        _mm256_loadu_ps( ip ));

    sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ),
        _mm256_loadu_ps( ip )));

    __m128 res = _mm_add_ps( _mm256_extractf128_ps( sum, 0 ),
        _mm256_extractf128_ps( sum, 1 ));

    res = _mm_add_ps( res, _mm_mul_ps( _mm_load_ps( flt + 8 ),
        _mm_loadu_ps( ip + 8 )));

    res = _mm_add_ps( res, _mm_movehl_ps( res, res ));

    _mm_store_ss( op, res );
    _mm_store_ss( op + 1, _mm_shuffle_ps( res, res, 1 ));

#elif defined( LANCIR_SSE2 )

    __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));
    __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_loadu_ps( ip + 4 ));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_loadu_ps( ip )));

    sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_loadu_ps( ip + 4 )));

    sumA = _mm_add_ps( sumA, sumB );

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 8 ),
        _mm_loadu_ps( ip + 8 )));

    sumA = _mm_add_ps( sumA, _mm_movehl_ps( sumA, sumA ));

    _mm_store_ss( op, sumA );
    _mm_store_ss( op + 1, _mm_shuffle_ps( sumA, sumA, 1 ));

#elif defined( LANCIR_NEON )

    float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip ));
    float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip ));
    sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    sumA = vaddq_f32( sumA, sumB );

    sumA = vmlaq_f32( sumA, vld1q_f32( flt + 8 ),
        vld1q_f32( ip + 8 ));

    vst1_f32( op, vadd_f32( vget_high_f32( sumA ), vget_low_f32( sumA )));

    // Scalar fallback:

    const float xx = flt[ 0 ];
    const float xx2 = flt[ 1 ];
    float sum0 = xx * ip[ 0 ];
    float sum1 = xx * ip[ 1 ];
    float sum2 = xx2 * ip[ 2 ];
    float sum3 = xx2 * ip[ 3 ];

    const float xx = flt[ 0 ];
    const float xx2 = flt[ 1 ];
    sum0 += xx * ip[ 0 ];
    sum1 += xx * ip[ 1 ];
    sum2 += xx2 * ip[ 2 ];
    sum3 += xx2 * ip[ 3 ];

    op[ 0 ] = sum0 + sum2;
    op[ 1 ] = sum1 + sum3;
template< bool UseSP >
static void resize3( const float* const sp, float* op, const size_t opinc,
    const CResizePos* rp, const int kl, const int DstLen )
{
#if LANCIR_ALIGN > 4

    const int ci = kl >> 2;
    const int cir = kl & 3;

#if defined( LANCIR_AVX )

    __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));
    __m256 sumB = _mm256_mul_ps( _mm256_loadu_ps( flt + 4 ),
        _mm256_loadu_ps( ip + 4 ));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_loadu_ps( ip )));

    sumB = _mm256_add_ps( sumB, _mm256_mul_ps(
        _mm256_loadu_ps( flt + 4 ), _mm256_loadu_ps( ip + 4 )));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 12 ),
        _mm_loadu_ps( ip + 12 )));

    _mm_storeu_ps( res, sumA );

    float o0 = res[ 0 ] + res[ 3 ];
    float o1 = res[ 1 ];
    float o2 = res[ 2 ];

    _mm256_storeu_ps( res + 4, sumB );

#elif defined( LANCIR_SSE2 )

    __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_loadu_ps( ip ));
    __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_loadu_ps( ip + 4 ));

    __m128 sumC = _mm_mul_ps( _mm_load_ps( flt + 8 ),
        _mm_loadu_ps( ip + 8 ));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_loadu_ps( ip )));

    sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_loadu_ps( ip + 4 )));

    sumC = _mm_add_ps( sumC, _mm_mul_ps( _mm_load_ps( flt + 8 ),
        _mm_loadu_ps( ip + 8 )));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt + 12 ),
        _mm_loadu_ps( ip + 12 )));

    _mm_storeu_ps( res, sumA );
    _mm_storeu_ps( res + 4, sumB );

    float o0 = res[ 0 ] + res[ 3 ];
    float o1 = res[ 1 ] + res[ 4 ];
    float o2 = res[ 2 ] + res[ 5 ];

    _mm_storeu_ps( res + 8, sumC );

#elif defined( LANCIR_NEON )

    float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip ));
    float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    float32x4_t sumC = vmulq_f32( vld1q_f32( flt + 8 ),
        vld1q_f32( ip + 8 ));

    sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip ));
    sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    sumC = vmlaq_f32( sumC, vld1q_f32( flt + 8 ),
        vld1q_f32( ip + 8 ));

    sumA = vmlaq_f32( sumA, vld1q_f32( flt + 12 ),
        vld1q_f32( ip + 12 ));

    vst1q_f32( res, sumA );
    vst1q_f32( res + 4, sumB );

    float o0 = res[ 0 ] + res[ 3 ];
    float o1 = res[ 1 ] + res[ 4 ];
    float o2 = res[ 2 ] + res[ 5 ];

    vst1q_f32( res + 8, sumC );

    o0 += res[ 6 ] + res[ 9 ];
    o1 += res[ 7 ] + res[ 10 ];
    o2 += res[ 8 ] + res[ 11 ];

    o1 += flt[ 16 ] * ip[ 16 ];
    o2 += flt[ 17 ] * ip[ 17 ];

    // Scalar path:

    const int ci = kl >> 1;

    const float xx = flt[ 0 ];
    float sum0 = xx * ip[ 0 ];
    float sum1 = xx * ip[ 1 ];
    float sum2 = xx * ip[ 2 ];
    const float xx2 = flt[ 1 ];
    float sum3 = xx2 * ip[ 3 ];
    float sum4 = xx2 * ip[ 4 ];
    float sum5 = xx2 * ip[ 5 ];

    const float xx = flt[ 0 ];
    sum0 += xx * ip[ 0 ];
    sum1 += xx * ip[ 1 ];
    sum2 += xx * ip[ 2 ];
    const float xx2 = flt[ 1 ];
    sum3 += xx2 * ip[ 3 ];
    sum4 += xx2 * ip[ 4 ];
    sum5 += xx2 * ip[ 5 ];

    op[ 0 ] = sum0 + sum3;
    op[ 1 ] = sum1 + sum4;
    op[ 2 ] = sum2 + sum5;
template< bool UseSP >
static void resize4( const float* const sp, float* op, const size_t opinc,
    const CResizePos* rp, const int kl, const int DstLen )
{
#if LANCIR_ALIGN > 4
    const int ci = kl >> 1;

#if defined( LANCIR_AVX )

    __m256 sum = _mm256_mul_ps( _mm256_load_ps( flt ),
        _mm256_loadu_ps( ip ));

    sum = _mm256_add_ps( sum, _mm256_mul_ps( _mm256_load_ps( flt ),
        _mm256_loadu_ps( ip )));

    _mm_store_ps( op, _mm_add_ps( _mm256_extractf128_ps( sum, 0 ),
        _mm256_extractf128_ps( sum, 1 )));

#elif defined( LANCIR_SSE2 )

    __m128 sumA = _mm_mul_ps( _mm_load_ps( flt ), _mm_load_ps( ip ));
    __m128 sumB = _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_load_ps( ip + 4 ));

    sumA = _mm_add_ps( sumA, _mm_mul_ps( _mm_load_ps( flt ),
        _mm_load_ps( ip )));

    sumB = _mm_add_ps( sumB, _mm_mul_ps( _mm_load_ps( flt + 4 ),
        _mm_load_ps( ip + 4 )));

    _mm_store_ps( op, _mm_add_ps( sumA, sumB ));

#elif defined( LANCIR_NEON )

    float32x4_t sumA = vmulq_f32( vld1q_f32( flt ), vld1q_f32( ip ));
    float32x4_t sumB = vmulq_f32( vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    sumA = vmlaq_f32( sumA, vld1q_f32( flt ), vld1q_f32( ip ));
    sumB = vmlaq_f32( sumB, vld1q_f32( flt + 4 ),
        vld1q_f32( ip + 4 ));

    vst1q_f32( op, vaddq_f32( sumA, sumB ));

    // Scalar fallback:

    const float xx = flt[ 0 ];
    float sum0 = xx * ip[ 0 ];
    float sum1 = xx * ip[ 1 ];
    float sum2 = xx * ip[ 2 ];
    float sum3 = xx * ip[ 3 ];

    const float xx = flt[ 0 ];
    sum0 += xx * ip[ 0 ];
    sum1 += xx * ip[ 1 ];
    sum2 += xx * ip[ 2 ];
    sum3 += xx * ip[ 3 ];

#undef LANCIR_LF_PRE
#undef LANCIR_LF_POST