Merge zero buffers

Improves cache efficiency.
2024-11-15 03:32:13 +01:00 · 2018-01-16 17:43:24 +01:00 · 2018-01-16 17:43:24 +01:00 · b019515ef4
commit b019515ef4
parent d75b93b0e8
7 changed files with 14 additions and 15 deletions
--- a/common/mc.c
+++ b/common/mc.c
@ -155,7 +155,7 @@ static weight_fn_t mc_weight_wtab[6] =
    mc_weight_w16,
    mc_weight_w20,
 };
-const x264_weight_t x264_weight_none[3] = { {{0}} };
+
 static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
 {
    for( int y = 0; y < i_height; y++ )
--- a/common/mc.h
+++ b/common/mc.h
@ -244,8 +244,7 @@ typedef struct x264_weight_t
    weight_fn_t *weightfn;
 } ALIGNED_16( x264_weight_t );

-#define x264_weight_none x264_template(weight_none)
-extern const x264_weight_t x264_weight_none[3];
+#define x264_weight_none ((const x264_weight_t*)x264_zero)

 #define SET_WEIGHT( w, b, s, d, o )\
 {\
--- a/common/tables.c
+++ b/common/tables.c
@ -2534,3 +2534,6 @@ const vlc_t x264_run_before_init[7][16] =
        { 0x1, 11 }, /* str=00000000001 */
    },
 };
+
+/* psy_trellis_init() has the largest size requirement of 16*FDEC_STRIDE*sizeof(pixel) */
+ALIGNED_64( uint8_t x264_zero[1024] ) = { 0 };
--- a/common/tables.h
+++ b/common/tables.h
@ -94,4 +94,6 @@ extern const vlc_t x264_total_zeros_2x2_dc[3][4];
 extern const vlc_t x264_total_zeros_2x4_dc[7][8];
 extern const vlc_t x264_run_before_init[7][16];

+extern uint8_t x264_zero[1024];
+
 #endif
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@ -558,12 +558,10 @@ static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra,
 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 static void inline psy_trellis_init( x264_t *h, int do_both_dct )
 {
-    ALIGNED_64( static pixel zero[16*FDEC_STRIDE] ) = {0};
-
    if( do_both_dct || h->mb.b_transform_8x8 )
-        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
+        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], (pixel*)x264_zero );
    if( do_both_dct || !h->mb.b_transform_8x8 )
-        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
+        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], (pixel*)x264_zero );
 }

 /* Reset fenc satd scores cache for psy RD */
--- a/encoder/me.c
+++ b/encoder/me.c
@ -633,7 +633,6 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
            /* successive elimination by comparing DC before a full SAD,
             * because sum(abs(diff)) >= abs(diff(sum)). */
            uint16_t *sums_base = m->integral;
-            ALIGNED_16( static pixel zero[8*FENC_STRIDE] ) = {0};
            ALIGNED_ARRAY_16( int, enc_dc,[4] );
            int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
            int delta = x264_pixel_size[sad_size].w;
@ -641,7 +640,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
            int xn;
            uint16_t *cost_fpel_mvx = h->cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);

-            h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
+            h->pixf.sad_x4[sad_size]( (pixel*)x264_zero, p_fenc, p_fenc+delta,
                p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
                FENC_STRIDE, enc_dc );
            if( delta == 4 )
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@ -96,7 +96,6 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
    static const uint8_t satd_shift_x[3] = {3,   2,   2};
    static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
    static const uint8_t  satd_offset[3] = {0,   8,   16};
-    ALIGNED_16( static pixel zero[16] ) = {0};
    int cache_index = (x >> satd_shift_x[size - PIXEL_8x4]) + (y >> satd_shift_y[size - PIXEL_8x4])
                    + satd_offset[size - PIXEL_8x4];
    int res = h->mb.pic.fenc_satd_cache[cache_index];
@ -105,8 +104,8 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
    else
    {
        pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
-        int dc = h->pixf.sad[size]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
-        res = h->pixf.satd[size]( fenc, FENC_STRIDE, zero, 0 ) - dc;
+        int dc = h->pixf.sad[size]( fenc, FENC_STRIDE, (pixel*)x264_zero, 0 ) >> 1;
+        res = h->pixf.satd[size]( fenc, FENC_STRIDE, (pixel*)x264_zero, 0 ) - dc;
        h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
        return res;
    }
@ -123,7 +122,6 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )

 static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
 {
-    ALIGNED_16( static pixel zero[16] ) = {0};
    int satd = 0;
    pixel *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
    pixel *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
@ -140,8 +138,8 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
        }
        else
        {
-            int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
-            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
+            int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, (pixel*)x264_zero, 0 ) >> 1;
+            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, (pixel*)x264_zero, 0 ) - dc - cached_satd( h, size, x, y ));
        }
        satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
    }