mirror of https://code.videolan.org/videolan/x264
arm/aarch64: use plane_copy wrapper macros
Move the macros to common/mc.h to share them across all architectures. Fixes possible buffer overreads if the width of the user-supplied frames is not a multiple of 16. Reported-by: Kirill Batuzov <batuzovk@ispras.ru>
This commit is contained in:
parent
3f5ed56d41
commit
5caef139cf
|
@ -1253,7 +1253,7 @@ load_deinterleave_chroma:
|
|||
ret
|
||||
endfunc
|
||||
|
||||
function x264_plane_copy_neon, export=1
|
||||
function x264_plane_copy_core_neon, export=1
|
||||
add x8, x4, #15
|
||||
and x4, x8, #~15
|
||||
sub x1, x1, x4
|
||||
|
@ -1352,7 +1352,7 @@ function x264_plane_copy_deinterleave_rgb_neon, export=1
|
|||
ret
|
||||
endfunc
|
||||
|
||||
function x264_plane_copy_interleave_neon, export=1
|
||||
function x264_plane_copy_interleave_core_neon, export=1
|
||||
add w9, w6, #15
|
||||
and w9, w9, #0xfffffff0
|
||||
sub x1, x1, x9, lsl #1
|
||||
|
|
|
@ -49,8 +49,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
|
|||
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||
|
||||
void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
|
||||
pixel *dstv, intptr_t i_dstv,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
|
@ -58,9 +58,9 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
|
|||
pixel *dstb, intptr_t i_dstb,
|
||||
pixel *dstc, intptr_t i_dstc,
|
||||
pixel *src, intptr_t i_src, int pw, int w, int h );
|
||||
void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *srcu, intptr_t i_srcu,
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *srcu, intptr_t i_srcu,
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||
|
||||
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
|
||||
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||
|
@ -206,6 +206,9 @@ static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
|
|||
void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
|
||||
uint8_t *src, intptr_t stride, int width,
|
||||
int height, int16_t *buf );
|
||||
|
||||
PLANE_COPY(16, neon)
|
||||
PLANE_INTERLEAVE(neon)
|
||||
#endif // !HIGH_BIT_DEPTH
|
||||
|
||||
PROPAGATE_LIST(neon)
|
||||
|
|
|
@ -1468,7 +1468,7 @@ function x264_load_deinterleave_chroma_fenc_neon
|
|||
bx lr
|
||||
endfunc
|
||||
|
||||
function x264_plane_copy_neon
|
||||
function x264_plane_copy_core_neon
|
||||
push {r4,lr}
|
||||
ldr r4, [sp, #8]
|
||||
ldr lr, [sp, #12]
|
||||
|
@ -1577,7 +1577,7 @@ block4:
|
|||
pop {r4-r8, r10, r11, pc}
|
||||
endfunc
|
||||
|
||||
function x264_plane_copy_interleave_neon
|
||||
function x264_plane_copy_interleave_core_neon
|
||||
push {r4-r7, lr}
|
||||
ldrd r6, r7, [sp, #28]
|
||||
ldrd r4, r5, [sp, #20]
|
||||
|
@ -1604,7 +1604,7 @@ blocki:
|
|||
pop {r4-r7, pc}
|
||||
endfunc
|
||||
|
||||
function x264_plane_copy_swap_neon
|
||||
function x264_plane_copy_swap_core_neon
|
||||
push {r4-r5, lr}
|
||||
ldrd r4, r5, [sp, #12]
|
||||
add lr, r4, #15
|
||||
|
|
|
@ -48,8 +48,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
|
|||
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||
|
||||
void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
|
||||
pixel *dstv, intptr_t i_dstv,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
|
@ -57,11 +57,11 @@ void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
|
|||
pixel *dstb, intptr_t i_dstb,
|
||||
pixel *dstc, intptr_t i_dstc,
|
||||
pixel *src, intptr_t i_src, int pw, int w, int h );
|
||||
void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *srcu, intptr_t i_srcu,
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||
void x264_plane_copy_swap_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *srcu, intptr_t i_srcu,
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
|
||||
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
|
||||
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||
|
@ -232,6 +232,10 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
|
|||
src += stride;
|
||||
}
|
||||
}
|
||||
|
||||
PLANE_COPY(16, neon)
|
||||
PLANE_COPY_SWAP(16, neon)
|
||||
PLANE_INTERLEAVE(neon)
|
||||
#endif // !HIGH_BIT_DEPTH
|
||||
|
||||
PROPAGATE_LIST(neon)
|
||||
|
|
92
common/mc.h
92
common/mc.h
|
@ -100,6 +100,98 @@ static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, in
|
|||
}\
|
||||
}
|
||||
|
||||
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
|
||||
|
||||
/* PLANE_COPY(align, cpu): defines static x264_plane_copy_<cpu>(), a wrapper
 * around x264_plane_copy_core_<cpu>().
 * c_w is align/sizeof(pixel) - 1 and is used as a bit mask on the width w.
 *  - w < 256: the whole copy is delegated to x264_plane_copy_c().
 *  - (w & c_w) == 0: the core routine is called with the width unchanged.
 *  - otherwise: the core routine copies h-1 rows with the width rounded up
 *    via (w+c_w)&~c_w, and the one remaining row (the last row when
 *    i_src > 0, the first row otherwise) is copied with memcpy() of exactly
 *    w pixels, so that src is not read past that row. */
#define PLANE_COPY(align, cpu)\
|
||||
static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
|
||||
{\
|
||||
int c_w = (align) / sizeof(pixel) - 1;\
|
||||
if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
|
||||
x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
|
||||
else if( !(w&c_w) )\
|
||||
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
|
||||
else\
|
||||
{\
|
||||
if( --h > 0 )\
|
||||
{\
|
||||
if( i_src > 0 )\
|
||||
{\
|
||||
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
|
||||
dst += i_dst * h;\
|
||||
src += i_src * h;\
|
||||
}\
|
||||
else\
|
||||
x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
|
||||
}\
|
||||
/* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
|
||||
memcpy( dst, src, w*sizeof(pixel) );\
|
||||
}\
|
||||
}
|
||||
|
||||
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
|
||||
|
||||
/* PLANE_COPY_SWAP(align, cpu): defines static x264_plane_copy_swap_<cpu>(),
 * a wrapper around x264_plane_copy_swap_core_<cpu>().
 * c_w is (align>>1)/sizeof(pixel) - 1 and is used as a bit mask on the width w.
 * The tail loop indexes dst/src up to 2*w, i.e. each unit of w covers a pair
 * of pixel elements whose order is exchanged.
 *  - (w & c_w) == 0: the core routine is called with the width unchanged.
 *  - else if w > c_w: the core routine handles h-1 rows with the width rounded
 *    up via (w+c_w)&~c_w; the one remaining row (the last row when i_src > 0,
 *    the first row otherwise) is handled by a height-1 core call on the
 *    rounded-down width w&~c_w followed by a scalar loop that swaps the
 *    leftover pairs.
 *  - else: the whole plane goes through x264_plane_copy_swap_c().
 * NOTE(review): `align` is not parenthesized in (align>>1), unlike (align) in
 * PLANE_COPY; harmless for the literal arguments used, but worth confirming. */
#define PLANE_COPY_SWAP(align, cpu)\
|
||||
static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
|
||||
{\
|
||||
int c_w = (align>>1) / sizeof(pixel) - 1;\
|
||||
if( !(w&c_w) )\
|
||||
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
|
||||
else if( w > c_w )\
|
||||
{\
|
||||
if( --h > 0 )\
|
||||
{\
|
||||
if( i_src > 0 )\
|
||||
{\
|
||||
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
|
||||
dst += i_dst * h;\
|
||||
src += i_src * h;\
|
||||
}\
|
||||
else\
|
||||
x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
|
||||
}\
|
||||
x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
|
||||
for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
|
||||
{\
|
||||
dst[x] = src[x+1];\
|
||||
dst[x+1] = src[x];\
|
||||
}\
|
||||
}\
|
||||
else\
|
||||
x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
|
||||
}
|
||||
|
||||
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
|
||||
pixel *srcu, intptr_t i_srcu,
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||
|
||||
/* PLANE_INTERLEAVE(cpu): defines static x264_plane_copy_interleave_<cpu>(),
 * a wrapper around x264_plane_copy_interleave_core_<cpu>().
 * c_w is 16/sizeof(pixel) - 1 and is used as a bit mask on the width w.
 *  - (w & c_w) == 0: the core routine is called with the width unchanged.
 *  - else if w > c_w and (i_srcu ^ i_srcv) >= 0: the core routine handles
 *    h-1 rows with the width rounded up via (w+c_w)&~c_w; the one remaining
 *    row (the last row when i_srcu > 0, the first row otherwise) is done by
 *    x264_plane_copy_interleave_c() with the exact width w and height 1.
 *  - else: the whole plane goes through x264_plane_copy_interleave_c(). */
#define PLANE_INTERLEAVE(cpu) \
|
||||
static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
|
||||
pixel *srcu, intptr_t i_srcu,\
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h )\
|
||||
{\
|
||||
int c_w = 16 / sizeof(pixel) - 1;\
|
||||
if( !(w&c_w) )\
|
||||
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
|
||||
else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
|
||||
{\
|
||||
if( --h > 0 )\
|
||||
{\
|
||||
if( i_srcu > 0 )\
|
||||
{\
|
||||
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
|
||||
dst += i_dst * h;\
|
||||
srcu += i_srcu * h;\
|
||||
srcv += i_srcv * h;\
|
||||
}\
|
||||
else\
|
||||
x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
|
||||
}\
|
||||
x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
|
||||
}\
|
||||
else\
|
||||
x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
|
||||
}
|
||||
|
||||
struct x264_weight_t;
|
||||
typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int );
|
||||
typedef struct x264_weight_t
|
||||
|
|
|
@ -88,10 +88,8 @@ void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
|
|||
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
|
||||
void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
|
||||
void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
|
||||
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
|
||||
void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
|
||||
void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
|
||||
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
|
||||
void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst,
|
||||
pixel *srcu, intptr_t i_srcu,
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||
|
@ -101,9 +99,6 @@ void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst,
|
|||
void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst,
|
||||
pixel *srcu, intptr_t i_srcu,
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
|
||||
pixel *srcu, intptr_t i_srcu,
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||
void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu,
|
||||
pixel *dstv, intptr_t i_dstv,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
|
@ -493,96 +488,12 @@ HPEL(32, avx2, avx2, avx2, avx2)
|
|||
#endif
|
||||
#endif // HIGH_BIT_DEPTH
|
||||
|
||||
/* PLANE_COPY(align, cpu): defines static x264_plane_copy_<cpu>(), a wrapper
 * around x264_plane_copy_core_<cpu>().
 * c_w is align/sizeof(pixel) - 1 and is used as a bit mask on the width w.
 *  - w < 256: the whole copy is delegated to x264_plane_copy_c().
 *  - (w & c_w) == 0: the core routine is called with the width unchanged.
 *  - otherwise: the core routine copies h-1 rows with the width rounded up
 *    via (w+c_w)&~c_w, and the one remaining row (the last row when
 *    i_src > 0, the first row otherwise) is copied with memcpy() of exactly
 *    w pixels, so that src is not read past that row. */
#define PLANE_COPY(align, cpu)\
|
||||
static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
|
||||
{\
|
||||
int c_w = (align) / sizeof(pixel) - 1;\
|
||||
if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
|
||||
x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
|
||||
else if( !(w&c_w) )\
|
||||
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
|
||||
else\
|
||||
{\
|
||||
if( --h > 0 )\
|
||||
{\
|
||||
if( i_src > 0 )\
|
||||
{\
|
||||
x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
|
||||
dst += i_dst * h;\
|
||||
src += i_src * h;\
|
||||
}\
|
||||
else\
|
||||
x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
|
||||
}\
|
||||
/* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
|
||||
memcpy( dst, src, w*sizeof(pixel) );\
|
||||
}\
|
||||
}
|
||||
|
||||
PLANE_COPY(16, sse)
|
||||
PLANE_COPY(32, avx)
|
||||
|
||||
/* PLANE_COPY_SWAP(align, cpu): defines static x264_plane_copy_swap_<cpu>(),
 * a wrapper around x264_plane_copy_swap_core_<cpu>().
 * c_w is (align>>1)/sizeof(pixel) - 1 and is used as a bit mask on the width w.
 * The tail loop indexes dst/src up to 2*w, i.e. each unit of w covers a pair
 * of pixel elements whose order is exchanged.
 *  - (w & c_w) == 0: the core routine is called with the width unchanged.
 *  - else if w > c_w: the core routine handles h-1 rows with the width rounded
 *    up via (w+c_w)&~c_w; the one remaining row (the last row when i_src > 0,
 *    the first row otherwise) is handled by a height-1 core call on the
 *    rounded-down width w&~c_w followed by a scalar loop that swaps the
 *    leftover pairs.
 *  - else: the whole plane goes through x264_plane_copy_swap_c().
 * NOTE(review): `align` is not parenthesized in (align>>1), unlike (align) in
 * PLANE_COPY; harmless for the literal arguments used, but worth confirming. */
#define PLANE_COPY_SWAP(align, cpu)\
|
||||
static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
|
||||
{\
|
||||
int c_w = (align>>1) / sizeof(pixel) - 1;\
|
||||
if( !(w&c_w) )\
|
||||
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
|
||||
else if( w > c_w )\
|
||||
{\
|
||||
if( --h > 0 )\
|
||||
{\
|
||||
if( i_src > 0 )\
|
||||
{\
|
||||
x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
|
||||
dst += i_dst * h;\
|
||||
src += i_src * h;\
|
||||
}\
|
||||
else\
|
||||
x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
|
||||
}\
|
||||
x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
|
||||
for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
|
||||
{\
|
||||
dst[x] = src[x+1];\
|
||||
dst[x+1] = src[x];\
|
||||
}\
|
||||
}\
|
||||
else\
|
||||
x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
|
||||
}
|
||||
|
||||
PLANE_COPY_SWAP(16, ssse3)
|
||||
PLANE_COPY_SWAP(32, avx2)
|
||||
|
||||
/* PLANE_INTERLEAVE(cpu): defines static x264_plane_copy_interleave_<cpu>(),
 * a wrapper around x264_plane_copy_interleave_core_<cpu>().
 * c_w is 16/sizeof(pixel) - 1 and is used as a bit mask on the width w.
 *  - (w & c_w) == 0: the core routine is called with the width unchanged.
 *  - else if w > c_w and (i_srcu ^ i_srcv) >= 0: the core routine handles
 *    h-1 rows with the width rounded up via (w+c_w)&~c_w; the one remaining
 *    row (the last row when i_srcu > 0, the first row otherwise) is done by
 *    x264_plane_copy_interleave_c() with the exact width w and height 1.
 *  - else: the whole plane goes through x264_plane_copy_interleave_c(). */
#define PLANE_INTERLEAVE(cpu) \
|
||||
static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
|
||||
pixel *srcu, intptr_t i_srcu,\
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h )\
|
||||
{\
|
||||
int c_w = 16 / sizeof(pixel) - 1;\
|
||||
if( !(w&c_w) )\
|
||||
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
|
||||
else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
|
||||
{\
|
||||
if( --h > 0 )\
|
||||
{\
|
||||
if( i_srcu > 0 )\
|
||||
{\
|
||||
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
|
||||
dst += i_dst * h;\
|
||||
srcu += i_srcu * h;\
|
||||
srcv += i_srcv * h;\
|
||||
}\
|
||||
else\
|
||||
x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
|
||||
}\
|
||||
x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
|
||||
}\
|
||||
else\
|
||||
x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
|
||||
}
|
||||
|
||||
PLANE_INTERLEAVE(mmx2)
|
||||
PLANE_INTERLEAVE(sse2)
|
||||
#if HIGH_BIT_DEPTH
|
||||
|
|
Loading…
Reference in New Issue