From 818bfe7f0a3ff243deb63c4b146de2563f38ffd4 Mon Sep 17 00:00:00 2001 From: Anton Khirnov Date: Fri, 24 Jul 2015 08:24:21 +0200 Subject: [PATCH] hevcdsp: split the epel functions by width This should allow for more efficient SIMD. --- libavcodec/hevc.c | 29 ++++++-------- libavcodec/hevcdsp.c | 18 +++++++-- libavcodec/hevcdsp.h | 6 +-- libavcodec/hevcdsp_template.c | 75 +++++++++++++++++++++++++++-------- 4 files changed, 89 insertions(+), 39 deletions(-) diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c index f2303ac6f5..63d3bc7256 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c @@ -1533,7 +1533,7 @@ static void luma_mc(HEVCContext *s, int16_t *dst, ptrdiff_t dststride, */ static void chroma_mc(HEVCContext *s, int16_t *dst1, int16_t *dst2, ptrdiff_t dststride, AVFrame *ref, const Mv *mv, - int x_off, int y_off, int block_w, int block_h) + int x_off, int y_off, int block_w, int block_h, int pred_idx) { HEVCLocalContext *lc = &s->HEVClc; uint8_t *src1 = ref->data[1]; @@ -1571,8 +1571,8 @@ static void chroma_mc(HEVCContext *s, int16_t *dst1, int16_t *dst2, src1 = lc->edge_emu_buffer + buf_offset1; src1stride = edge_emu_stride; - s->hevcdsp.put_hevc_epel[!!my][!!mx](dst1, dststride, src1, src1stride, - block_w, block_h, mx, my, lc->mc_buffer); + s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst1, dststride, src1, src1stride, + block_h, mx, my, lc->mc_buffer); s->vdsp.emulated_edge_mc(lc->edge_emu_buffer, src2 - offset2, edge_emu_stride, src2stride, @@ -1583,16 +1583,13 @@ static void chroma_mc(HEVCContext *s, int16_t *dst1, int16_t *dst2, src2 = lc->edge_emu_buffer + buf_offset2; src2stride = edge_emu_stride; - s->hevcdsp.put_hevc_epel[!!my][!!mx](dst2, dststride, src2, src2stride, - block_w, block_h, mx, my, - lc->mc_buffer); + s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst2, dststride, src2, src2stride, + block_h, mx, my, lc->mc_buffer); } else { - s->hevcdsp.put_hevc_epel[!!my][!!mx](dst1, dststride, src1, src1stride, - block_w, block_h, mx, my, - lc->mc_buffer); - s->hevcdsp.put_hevc_epel[!!my][!!mx](dst2, dststride, src2, src2stride, - block_w, block_h, mx, my, - lc->mc_buffer); + s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst1, dststride, src1, src1stride, + block_h, mx, my, lc->mc_buffer); + s->hevcdsp.put_hevc_epel[!!my][!!mx][pred_idx](dst2, dststride, src2, src2stride, + block_h, mx, my, lc->mc_buffer); } } @@ -1737,7 +1734,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, s->hevcdsp.put_unweighted_pred(dst0, s->frame->linesize[0], tmp, tmpstride, nPbW, nPbH); } chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame, - ¤t_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2); + ¤t_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx); if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { @@ -1774,7 +1771,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, } chroma_mc(s, tmp, tmp2, tmpstride, ref1->frame, - ¤t_mv.mv[1], x0/2, y0/2, nPbW/2, nPbH/2); + ¤t_mv.mv[1], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx); if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { @@ -1816,9 +1813,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, } chroma_mc(s, tmp, tmp2, tmpstride, ref0->frame, - ¤t_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2); + ¤t_mv.mv[0], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx); chroma_mc(s, tmp3, tmp4, tmpstride, ref1->frame, - ¤t_mv.mv[1], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2); + ¤t_mv.mv[1], x0 / 2, y0 / 2, nPbW / 2, nPbH / 2, pred_idx); if ((s->sh.slice_type == P_SLICE && s->ps.pps->weighted_pred_flag) || (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag)) { diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c index 86d9e85b92..7f42399dfc 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -122,6 +122,12 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) hevcdsp->put_hevc_qpel[1][0][i] = FUNC(put_hevc_qpel_v_ ## width, depth); \ hevcdsp->put_hevc_qpel[1][1][i] = FUNC(put_hevc_qpel_hv_ ## width, depth); \ +#define EPEL_FUNC(i, width, depth) \ + hevcdsp->put_hevc_epel[0][0][i] = FUNC(put_hevc_epel_pixels_ ## width, depth); \ + hevcdsp->put_hevc_epel[0][1][i] = FUNC(put_hevc_epel_h_ ## width, depth); \ + hevcdsp->put_hevc_epel[1][0][i] = FUNC(put_hevc_epel_v_ ## width, depth); \ + hevcdsp->put_hevc_epel[1][1][i] = FUNC(put_hevc_epel_hv_ ## width, depth); \ + #define HEVC_DSP(depth) \ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ hevcdsp->transquant_bypass[0] = FUNC(transquant_bypass4x4, depth); \ @@ -154,10 +160,14 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) QPEL_FUNC(6, 48, depth); \ QPEL_FUNC(7, 64, depth); \ \ - hevcdsp->put_hevc_epel[0][0] = FUNC(put_hevc_epel_pixels, depth); \ - hevcdsp->put_hevc_epel[0][1] = FUNC(put_hevc_epel_h, depth); \ - hevcdsp->put_hevc_epel[1][0] = FUNC(put_hevc_epel_v, depth); \ - hevcdsp->put_hevc_epel[1][1] = FUNC(put_hevc_epel_hv, depth); \ + EPEL_FUNC(0, 2, depth); \ + EPEL_FUNC(1, 4, depth); \ + EPEL_FUNC(2, 6, depth); \ + EPEL_FUNC(3, 8, depth); \ + EPEL_FUNC(4, 12, depth); \ + EPEL_FUNC(5, 16, depth); \ + EPEL_FUNC(6, 24, depth); \ + EPEL_FUNC(7, 32, depth); \ \ hevcdsp->put_unweighted_pred = FUNC(put_unweighted_pred, depth); \ hevcdsp->put_unweighted_pred_avg = FUNC(put_unweighted_pred_avg, depth); \ diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h index c250385341..e906c5e4aa 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h @@ -61,9 +61,9 @@ typedef struct HEVCDSPContext { void (*put_hevc_qpel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int mx, int my, int16_t *mcbuffer); - void (*put_hevc_epel[2][2])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, - ptrdiff_t srcstride, int width, int height, - int mx, int my, int16_t *mcbuffer); + void (*put_hevc_epel[2][2][8])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, + ptrdiff_t srcstride, int height, + int mx, int my, int16_t *mcbuffer); void (*put_unweighted_pred)(uint8_t *dst, ptrdiff_t dststride, int16_t *src, ptrdiff_t srcstride, int width, int height); diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c index 84503ec2f6..d832904dcb 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c @@ -982,10 +982,10 @@ QPEL(12) QPEL(8) QPEL(4) -static void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride, - uint8_t *_src, ptrdiff_t _srcstride, - int width, int height, int mx, int my, - int16_t* mcbuffer) +static inline void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int width, int height, int mx, int my, + int16_t* mcbuffer) { int x, y; pixel *src = (pixel *)_src; @@ -1005,10 +1005,10 @@ static void FUNC(put_hevc_epel_pixels)(int16_t *dst, ptrdiff_t dststride, filter_2 * src[x + stride] + \ filter_3 * src[x + 2 * stride]) -static void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride, - uint8_t *_src, ptrdiff_t _srcstride, - int width, int height, int mx, int my, - int16_t* mcbuffer) +static inline void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int width, int height, int mx, int my, + int16_t* mcbuffer) { int x, y; pixel *src = (pixel *)_src; @@ -1026,10 +1026,10 @@ static void FUNC(put_hevc_epel_h)(int16_t *dst, ptrdiff_t dststride, } } -static void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride, - uint8_t *_src, ptrdiff_t _srcstride, - int width, int height, int mx, int my, - int16_t* mcbuffer) +static inline void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int width, int height, int mx, int my, + int16_t* mcbuffer) { int x, y; pixel *src = (pixel *)_src; @@ -1048,10 +1048,10 @@ static void FUNC(put_hevc_epel_v)(int16_t *dst, ptrdiff_t dststride, } } -static void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride, - uint8_t *_src, ptrdiff_t _srcstride, - int width, int height, int mx, int my, - int16_t* mcbuffer) +static inline void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride, + uint8_t *_src, ptrdiff_t _srcstride, + int width, int height, int mx, int my, + int16_t* mcbuffer) { int x, y; pixel *src = (pixel *)_src; @@ -1087,6 +1087,49 @@ static void FUNC(put_hevc_epel_hv)(int16_t *dst, ptrdiff_t dststride, } } +#define EPEL(W) \ +static void FUNC(put_hevc_epel_pixels_ ## W)(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, \ + int16_t *mcbuffer) \ +{ \ + FUNC(put_hevc_epel_pixels)(dst, dststride, src, srcstride, \ + W, height, mx, my, mcbuffer); \ +} \ +static void FUNC(put_hevc_epel_h_ ## W)(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, \ + int16_t *mcbuffer) \ +{ \ + FUNC(put_hevc_epel_h)(dst, dststride, src, srcstride, \ + W, height, mx, my, mcbuffer); \ +} \ +static void FUNC(put_hevc_epel_v_ ## W)(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, \ + int16_t *mcbuffer) \ +{ \ + FUNC(put_hevc_epel_v)(dst, dststride, src, srcstride, \ + W, height, mx, my, mcbuffer); \ +} \ +static void FUNC(put_hevc_epel_hv_ ## W)(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, \ + int16_t *mcbuffer) \ +{ \ + FUNC(put_hevc_epel_hv)(dst, dststride, src, srcstride, \ + W, height, mx, my, mcbuffer); \ +} + +EPEL(32) +EPEL(24) +EPEL(16) +EPEL(12) +EPEL(8) +EPEL(6) +EPEL(4) +EPEL(2) + static void FUNC(put_unweighted_pred)(uint8_t *_dst, ptrdiff_t _dststride, int16_t *src, ptrdiff_t srcstride, int width, int height)