diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index 575e79fe5e..fbbd0696b7 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -22,6 +22,7 @@ OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \ OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_init_arm.o OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o +OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o @@ -61,6 +62,7 @@ ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \ arm/idctdsp_armv6.o \ arm/simple_idct_armv6.o ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o +ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_armv6.o ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o ARMV6-OBJS-$(CONFIG_VC1_DECODER) += arm/startcode_armv6.o diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S index 8876d5fa18..60232243e5 100644 --- a/libavcodec/arm/dsputil_armv6.S +++ b/libavcodec/arm/dsputil_armv6.S @@ -297,58 +297,3 @@ function ff_sse16_armv6, export=1 pop {r4-r9, pc} endfunc - -function ff_pix_norm1_armv6, export=1 - push {r4-r6, lr} - mov r12, #16 - mov lr, #0 -1: - ldm r0, {r2-r5} - uxtb16 r6, r2 - uxtb16 r2, r2, ror #8 - smlad lr, r6, r6, lr - uxtb16 r6, r3 - smlad lr, r2, r2, lr - uxtb16 r3, r3, ror #8 - smlad lr, r6, r6, lr - uxtb16 r6, r4 - smlad lr, r3, r3, lr - uxtb16 r4, r4, ror #8 - smlad lr, r6, r6, lr - uxtb16 r6, r5 - smlad lr, r4, r4, lr - uxtb16 r5, r5, ror #8 - smlad lr, r6, r6, lr - subs r12, r12, #1 - add r0, r0, r1 - smlad lr, r5, r5, lr - bgt 1b - - mov r0, lr - pop {r4-r6, pc} -endfunc - -function ff_pix_sum_armv6, export=1 - push {r4-r7, lr} - mov r12, #16 - mov r2, #0 - mov r3, #0 - mov lr, #0 - ldr r4, [r0] -1: - subs r12, r12, #1 - ldr r5, [r0, #4] - usada8 r2, r4, lr, r2 - ldr r6, [r0, #8] - 
usada8 r3, r5, lr, r3 - ldr r7, [r0, #12] - usada8 r2, r6, lr, r2 - beq 2f - ldr_pre r4, r0, r1 - usada8 r3, r7, lr, r3 - bgt 1b -2: - usada8 r3, r7, lr, r3 - add r0, r2, r3 - pop {r4-r7, pc} -endfunc diff --git a/libavcodec/arm/dsputil_init_armv6.c b/libavcodec/arm/dsputil_init_armv6.c index 57b90daa1e..1cfad42183 100644 --- a/libavcodec/arm/dsputil_init_armv6.c +++ b/libavcodec/arm/dsputil_init_armv6.c @@ -43,9 +43,6 @@ int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, int line_size, int h); -int ff_pix_norm1_armv6(uint8_t *pix, int line_size); -int ff_pix_sum_armv6(uint8_t *pix, int line_size); - av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { @@ -63,7 +60,4 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx, c->sad[1] = ff_pix_abs8_armv6; c->sse[0] = ff_sse16_armv6; - - c->pix_norm1 = ff_pix_norm1_armv6; - c->pix_sum = ff_pix_sum_armv6; } diff --git a/libavcodec/arm/mpegvideoencdsp_armv6.S b/libavcodec/arm/mpegvideoencdsp_armv6.S new file mode 100644 index 0000000000..ab0dad7b18 --- /dev/null +++ b/libavcodec/arm/mpegvideoencdsp_armv6.S @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_pix_norm1_armv6, export=1 + push {r4-r6, lr} + mov r12, #16 + mov lr, #0 +1: + ldm r0, {r2-r5} + uxtb16 r6, r2 + uxtb16 r2, r2, ror #8 + smlad lr, r6, r6, lr + uxtb16 r6, r3 + smlad lr, r2, r2, lr + uxtb16 r3, r3, ror #8 + smlad lr, r6, r6, lr + uxtb16 r6, r4 + smlad lr, r3, r3, lr + uxtb16 r4, r4, ror #8 + smlad lr, r6, r6, lr + uxtb16 r6, r5 + smlad lr, r4, r4, lr + uxtb16 r5, r5, ror #8 + smlad lr, r6, r6, lr + subs r12, r12, #1 + add r0, r0, r1 + smlad lr, r5, r5, lr + bgt 1b + + mov r0, lr + pop {r4-r6, pc} +endfunc + +function ff_pix_sum_armv6, export=1 + push {r4-r7, lr} + mov r12, #16 + mov r2, #0 + mov r3, #0 + mov lr, #0 + ldr r4, [r0] +1: + subs r12, r12, #1 + ldr r5, [r0, #4] + usada8 r2, r4, lr, r2 + ldr r6, [r0, #8] + usada8 r3, r5, lr, r3 + ldr r7, [r0, #12] + usada8 r2, r6, lr, r2 + beq 2f + ldr_pre r4, r0, r1 + usada8 r3, r7, lr, r3 + bgt 1b +2: + usada8 r3, r7, lr, r3 + add r0, r2, r3 + pop {r4-r7, pc} +endfunc diff --git a/libavcodec/arm/mpegvideoencdsp_init_arm.c b/libavcodec/arm/mpegvideoencdsp_init_arm.c new file mode 100644 index 0000000000..4bfe835684 --- /dev/null +++ b/libavcodec/arm/mpegvideoencdsp_init_arm.c @@ -0,0 +1,38 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/mpegvideoencdsp.h" + +int ff_pix_norm1_armv6(uint8_t *pix, int line_size); +int ff_pix_sum_armv6(uint8_t *pix, int line_size); + +av_cold void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c, + AVCodecContext *avctx) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_armv6(cpu_flags)) { + c->pix_norm1 = ff_pix_norm1_armv6; + c->pix_sum = ff_pix_sum_armv6; + } +} diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c index 850427e3fc..45b882da40 100644 --- a/libavcodec/dnxhdenc.c +++ b/libavcodec/dnxhdenc.c @@ -323,6 +323,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx) ff_blockdsp_init(&ctx->bdsp, avctx); ff_idctdsp_init(&ctx->m.idsp, avctx); + ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx); ff_dct_common_init(&ctx->m); ff_dct_encode_init(&ctx->m); @@ -733,8 +734,8 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int varc; if (!partial_last_row && mb_x * 16 <= avctx->width - 16) { - sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize); - varc = ctx->m.dsp.pix_norm1(pix, ctx->m.linesize); + sum = ctx->m.mpvencdsp.pix_sum(pix, ctx->m.linesize); + varc = ctx->m.mpvencdsp.pix_norm1(pix, ctx->m.linesize); } else { int bw = FFMIN(avctx->width - 16 * mb_x, 16); int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16); diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index fe3c135713..640a4bfa9c 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -26,7 +26,6 @@ */ #include "libavutil/attributes.h" -#include "libavutil/imgutils.h" #include "libavutil/internal.h" #include "avcodec.h" #include "copy_block.h" @@ -34,8 +33,6 @@ #include "dsputil.h" #include
"simple_idct.h" #include "faandct.h" -#include "imgconvert.h" -#include "mathops.h" #include "mpegvideo.h" #include "config.h" @@ -48,74 +45,6 @@ uint32_t ff_square_tab[512] = { 0, }; #define BIT_DEPTH 8 #include "dsputilenc_template.c" -static int pix_sum_c(uint8_t *pix, int line_size) -{ - int s = 0, i, j; - - for (i = 0; i < 16; i++) { - for (j = 0; j < 16; j += 8) { - s += pix[0]; - s += pix[1]; - s += pix[2]; - s += pix[3]; - s += pix[4]; - s += pix[5]; - s += pix[6]; - s += pix[7]; - pix += 8; - } - pix += line_size - 16; - } - return s; -} - -static int pix_norm1_c(uint8_t *pix, int line_size) -{ - int s = 0, i, j; - uint32_t *sq = ff_square_tab + 256; - - for (i = 0; i < 16; i++) { - for (j = 0; j < 16; j += 8) { -#if 0 - s += sq[pix[0]]; - s += sq[pix[1]]; - s += sq[pix[2]]; - s += sq[pix[3]]; - s += sq[pix[4]]; - s += sq[pix[5]]; - s += sq[pix[6]]; - s += sq[pix[7]]; -#else -#if HAVE_FAST_64BIT - register uint64_t x = *(uint64_t *) pix; - s += sq[x & 0xff]; - s += sq[(x >> 8) & 0xff]; - s += sq[(x >> 16) & 0xff]; - s += sq[(x >> 24) & 0xff]; - s += sq[(x >> 32) & 0xff]; - s += sq[(x >> 40) & 0xff]; - s += sq[(x >> 48) & 0xff]; - s += sq[(x >> 56) & 0xff]; -#else - register uint32_t x = *(uint32_t *) pix; - s += sq[x & 0xff]; - s += sq[(x >> 8) & 0xff]; - s += sq[(x >> 16) & 0xff]; - s += sq[(x >> 24) & 0xff]; - x = *(uint32_t *) (pix + 4); - s += sq[x & 0xff]; - s += sq[(x >> 8) & 0xff]; - s += sq[(x >> 16) & 0xff]; - s += sq[(x >> 24) & 0xff]; -#endif -#endif - pix += 8; - } - pix += line_size - 16; - } - return s; -} - static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { @@ -1094,9 +1023,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) c->sum_abs_dctelem = sum_abs_dctelem_c; - c->pix_sum = pix_sum_c; - c->pix_norm1 = pix_norm1_c; - /* TODO [0] 16 [1] 8 */ c->pix_abs[0][0] = pix_abs16_c; c->pix_abs[0][1] = pix_abs16_x2_c; @@ -1141,11 +1067,6 @@ av_cold void ff_dsputil_init(DSPContext *c, 
AVCodecContext *avctx) ff_dsputil_init_dwt(c); #endif - c->shrink[0] = av_image_copy_plane; - c->shrink[1] = ff_shrink22; - c->shrink[2] = ff_shrink44; - c->shrink[3] = ff_shrink88; - c->draw_edges = draw_edges_8_c; switch (avctx->bits_per_raw_sample) { diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 8633c90056..8dafbbd9d7 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -72,9 +72,6 @@ typedef struct DSPContext { int stride); int (*sum_abs_dctelem)(int16_t *block /* align 16 */); - int (*pix_sum)(uint8_t *pix, int line_size); - int (*pix_norm1)(uint8_t *pix, int line_size); - me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */ me_cmp_func sse[6]; me_cmp_func hadamard8_diff[6]; @@ -108,9 +105,6 @@ typedef struct DSPContext { #define EDGE_WIDTH 16 #define EDGE_TOP 1 #define EDGE_BOTTOM 2 - - void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, - int src_wrap, int width, int height); } DSPContext; void ff_dsputil_static_init(void); diff --git a/libavcodec/motion_est.c b/libavcodec/motion_est.c index b093580e8f..6b3cd61e8a 100644 --- a/libavcodec/motion_est.c +++ b/libavcodec/motion_est.c @@ -903,8 +903,9 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, /* intra / predictive decision */ pix = c->src[0][0]; - sum = s->dsp.pix_sum(pix, s->linesize); - varc = s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500; + sum = s->mpvencdsp.pix_sum(pix, s->linesize); + varc = s->mpvencdsp.pix_norm1(pix, s->linesize) - + (((unsigned) sum * sum) >> 8) + 500; pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8; pic->mb_var [s->mb_stride * mb_y + mb_x] = (varc+128)>>8; diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c index 4a453f3723..0d630df4ea 100644 --- a/libavcodec/mpegvideo_enc.c +++ b/libavcodec/mpegvideo_enc.c @@ -1010,7 +1010,7 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src, int offset = x + y * stride; int sad = s->dsp.sad[0](NULL, src + offset, ref + 
offset, stride, 16); - int mean = (s->dsp.pix_sum(src + offset, stride) + 128) >> 8; + int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8; int sae = get_sae(src + offset, mean, stride); acc += sae + 500 < sad; @@ -1278,15 +1278,21 @@ static int estimate_best_b_count(MpegEncContext *s) data[2] += INPLACE_OFFSET; } - s->dsp.shrink[scale](s->tmp_frames[i]->data[0], s->tmp_frames[i]->linesize[0], - data[0], pre_input.f->linesize[0], - c->width, c->height); - s->dsp.shrink[scale](s->tmp_frames[i]->data[1], s->tmp_frames[i]->linesize[1], - data[1], pre_input.f->linesize[1], - c->width >> 1, c->height >> 1); - s->dsp.shrink[scale](s->tmp_frames[i]->data[2], s->tmp_frames[i]->linesize[2], - data[2], pre_input.f->linesize[2], - c->width >> 1, c->height >> 1); + s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[0], + s->tmp_frames[i]->linesize[0], + data[0], + pre_input.f->linesize[0], + c->width, c->height); + s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[1], + s->tmp_frames[i]->linesize[1], + data[1], + pre_input.f->linesize[1], + c->width >> 1, c->height >> 1); + s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[2], + s->tmp_frames[i]->linesize[2], + data[2], + pre_input.f->linesize[2], + c->width >> 1, c->height >> 1); } } @@ -2585,9 +2591,10 @@ static int mb_var_thread(AVCodecContext *c, void *arg){ int yy = mb_y * 16; uint8_t *pix = s->new_picture.f->data[0] + (yy * s->linesize) + xx; int varc; - int sum = s->dsp.pix_sum(pix, s->linesize); + int sum = s->mpvencdsp.pix_sum(pix, s->linesize); - varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500 + 128)>>8; + varc = (s->mpvencdsp.pix_norm1(pix, s->linesize) - + (((unsigned) sum * sum) >> 8) + 500 + 128) >> 8; s->current_picture.mb_var [s->mb_stride * mb_y + mb_x] = varc; s->current_picture.mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8; diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c index c5e0b4874c..bde4345750 100644 --- 
a/libavcodec/mpegvideoencdsp.c +++ b/libavcodec/mpegvideoencdsp.c @@ -22,7 +22,10 @@ #include "config.h" #include "libavutil/avassert.h" #include "libavutil/attributes.h" +#include "libavutil/imgutils.h" #include "avcodec.h" +#include "dsputil.h" +#include "imgconvert.h" #include "mpegvideoencdsp.h" static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], @@ -54,12 +57,92 @@ static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale) (BASIS_SHIFT - RECON_SHIFT); } +static int pix_sum_c(uint8_t *pix, int line_size) +{ + int s = 0, i, j; + + for (i = 0; i < 16; i++) { + for (j = 0; j < 16; j += 8) { + s += pix[0]; + s += pix[1]; + s += pix[2]; + s += pix[3]; + s += pix[4]; + s += pix[5]; + s += pix[6]; + s += pix[7]; + pix += 8; + } + pix += line_size - 16; + } + return s; +} + +static int pix_norm1_c(uint8_t *pix, int line_size) +{ + int s = 0, i, j; + uint32_t *sq = ff_square_tab + 256; + + for (i = 0; i < 16; i++) { + for (j = 0; j < 16; j += 8) { +#if 0 + s += sq[pix[0]]; + s += sq[pix[1]]; + s += sq[pix[2]]; + s += sq[pix[3]]; + s += sq[pix[4]]; + s += sq[pix[5]]; + s += sq[pix[6]]; + s += sq[pix[7]]; +#else +#if HAVE_FAST_64BIT + register uint64_t x = *(uint64_t *) pix; + s += sq[x & 0xff]; + s += sq[(x >> 8) & 0xff]; + s += sq[(x >> 16) & 0xff]; + s += sq[(x >> 24) & 0xff]; + s += sq[(x >> 32) & 0xff]; + s += sq[(x >> 40) & 0xff]; + s += sq[(x >> 48) & 0xff]; + s += sq[(x >> 56) & 0xff]; +#else + register uint32_t x = *(uint32_t *) pix; + s += sq[x & 0xff]; + s += sq[(x >> 8) & 0xff]; + s += sq[(x >> 16) & 0xff]; + s += sq[(x >> 24) & 0xff]; + x = *(uint32_t *) (pix + 4); + s += sq[x & 0xff]; + s += sq[(x >> 8) & 0xff]; + s += sq[(x >> 16) & 0xff]; + s += sq[(x >> 24) & 0xff]; +#endif +#endif + pix += 8; + } + pix += line_size - 16; + } + return s; +} + av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, AVCodecContext *avctx) { c->try_8x8basis = try_8x8basis_c; c->add_8x8basis = add_8x8basis_c; + c->shrink[0] = 
av_image_copy_plane; + c->shrink[1] = ff_shrink22; + c->shrink[2] = ff_shrink44; + c->shrink[3] = ff_shrink88; + + c->pix_sum = pix_sum_c; + c->pix_norm1 = pix_norm1_c; + + if (ARCH_ARM) + ff_mpegvideoencdsp_init_arm(c, avctx); + if (ARCH_PPC) + ff_mpegvideoencdsp_init_ppc(c, avctx); if (ARCH_X86) ff_mpegvideoencdsp_init_x86(c, avctx); } diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h index a7bc2ae2c5..81e3fe67b0 100644 --- a/libavcodec/mpegvideoencdsp.h +++ b/libavcodec/mpegvideoencdsp.h @@ -31,10 +31,19 @@ typedef struct MpegvideoEncDSPContext { int16_t basis[64], int scale); void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale); + int (*pix_sum)(uint8_t *pix, int line_size); + int (*pix_norm1)(uint8_t *pix, int line_size); + + void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, + int src_wrap, int width, int height); } MpegvideoEncDSPContext; void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, AVCodecContext *avctx); +void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c, + AVCodecContext *avctx); +void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c, + AVCodecContext *avctx); void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, AVCodecContext *avctx); diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile index 88aaf2644a..c357dafbac 100644 --- a/libavcodec/ppc/Makefile +++ b/libavcodec/ppc/Makefile @@ -13,6 +13,7 @@ OBJS-$(CONFIG_IDCTDSP) += ppc/idctdsp.o OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \ ppc/mpegvideodsp.o +OBJS-$(CONFIG_MPEGVIDEOENC) += ppc/mpegvideoencdsp.o OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c index 20b15b019e..5ab1b51e2b 100644 --- a/libavcodec/ppc/dsputil_altivec.c +++ b/libavcodec/ppc/dsputil_altivec.c @@ -308,34 +308,6 @@ static int sad8_altivec(MpegEncContext 
*v, uint8_t *pix1, uint8_t *pix2, return s; } -static int pix_norm1_altivec(uint8_t *pix, int line_size) -{ - int i, s = 0; - const vector unsigned int zero = - (const vector unsigned int) vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix); - vector unsigned int sv = (vector unsigned int) vec_splat_u32(0); - vector signed int sum; - - for (i = 0; i < 16; i++) { - /* Read the potentially unaligned pixels. */ - vector unsigned char pixl = vec_ld(0, pix); - vector unsigned char pixr = vec_ld(15, pix); - vector unsigned char pixv = vec_perm(pixl, pixr, perm); - - /* Square the values, and add them to our sum. */ - sv = vec_msum(pixv, pixv, sv); - - pix += line_size; - } - /* Sum up the four partial sums, and put the result into s. */ - sum = vec_sums((vector signed int) sv, (vector signed int) zero); - sum = vec_splat(sum, 3); - vec_ste(sum, 0, &s); - - return s; -} - /* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced. * It's the sad8_altivec code above w/ squaring added. */ static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, @@ -430,35 +402,6 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, return s; } -static int pix_sum_altivec(uint8_t *pix, int line_size) -{ - int i, s; - const vector unsigned int zero = - (const vector unsigned int) vec_splat_u32(0); - vector unsigned char perm = vec_lvsl(0, pix); - vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); - vector signed int sumdiffs; - - for (i = 0; i < 16; i++) { - /* Read the potentially unaligned 16 pixels into t1. */ - vector unsigned char pixl = vec_ld(0, pix); - vector unsigned char pixr = vec_ld(15, pix); - vector unsigned char t1 = vec_perm(pixl, pixr, perm); - - /* Add each 4 pixel group together and put 4 results into sad. */ - sad = vec_sum4s(t1, sad); - - pix += line_size; - } - - /* Sum up the four partial sums, and put the result into s. 
*/ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, int line_size) { @@ -911,9 +854,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx, c->sse[0] = sse16_altivec; c->sse[1] = sse8_altivec; - c->pix_norm1 = pix_norm1_altivec; - c->pix_sum = pix_sum_altivec; - c->diff_pixels = diff_pixels_altivec; if (!high_bit_depth) { diff --git a/libavcodec/ppc/mpegvideoencdsp.c b/libavcodec/ppc/mpegvideoencdsp.c new file mode 100644 index 0000000000..00ae2a6f30 --- /dev/null +++ b/libavcodec/ppc/mpegvideoencdsp.c @@ -0,0 +1,103 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include <stdint.h> +#if HAVE_ALTIVEC_H +#include <altivec.h> +#endif + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/types_altivec.h" +#include "libavutil/ppc/util_altivec.h" +#include "libavcodec/mpegvideoencdsp.h" + +#if HAVE_ALTIVEC + +static int pix_norm1_altivec(uint8_t *pix, int line_size) +{ + int i, s = 0; + const vector unsigned int zero = + (const vector unsigned int) vec_splat_u32(0); + vector unsigned char perm = vec_lvsl(0, pix); + vector unsigned int sv = (vector unsigned int) vec_splat_u32(0); + vector signed int sum; + + for (i = 0; i < 16; i++) { + /* Read the potentially unaligned pixels. */ + vector unsigned char pixl = vec_ld(0, pix); + vector unsigned char pixr = vec_ld(15, pix); + vector unsigned char pixv = vec_perm(pixl, pixr, perm); + + /* Square the values, and add them to our sum. */ + sv = vec_msum(pixv, pixv, sv); + + pix += line_size; + } + /* Sum up the four partial sums, and put the result into s. */ + sum = vec_sums((vector signed int) sv, (vector signed int) zero); + sum = vec_splat(sum, 3); + vec_ste(sum, 0, &s); + + return s; +} + +static int pix_sum_altivec(uint8_t *pix, int line_size) +{ + int i, s; + const vector unsigned int zero = + (const vector unsigned int) vec_splat_u32(0); + vector unsigned char perm = vec_lvsl(0, pix); + vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); + vector signed int sumdiffs; + + for (i = 0; i < 16; i++) { + /* Read the potentially unaligned 16 pixels into t1. */ + vector unsigned char pixl = vec_ld(0, pix); + vector unsigned char pixr = vec_ld(15, pix); + vector unsigned char t1 = vec_perm(pixl, pixr, perm); + + /* Add each 4 pixel group together and put 4 results into sad.
*/ + sad = vec_sum4s(t1, sad); + + pix += line_size; + } + + /* Sum up the four partial sums, and put the result into s. */ + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); + sumdiffs = vec_splat(sumdiffs, 3); + vec_ste(sumdiffs, 0, &s); + + return s; +} + +#endif /* HAVE_ALTIVEC */ + +av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c, + AVCodecContext *avctx) +{ +#if HAVE_ALTIVEC + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + + c->pix_norm1 = pix_norm1_altivec; + c->pix_sum = pix_sum_altivec; +#endif /* HAVE_ALTIVEC */ +} diff --git a/libavcodec/svq1enc.c b/libavcodec/svq1enc.c index 6d1b397fd2..9ff690dce2 100644 --- a/libavcodec/svq1enc.c +++ b/libavcodec/svq1enc.c @@ -517,6 +517,7 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx) ff_dsputil_init(&s->dsp, avctx); ff_hpeldsp_init(&s->hdsp, avctx->flags); + ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx); avctx->coded_frame = av_frame_alloc(); s->current_picture = av_frame_alloc(); diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index f757be177d..ac336c7d86 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -109,6 +109,7 @@ YASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o YASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o YASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o +YASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o YASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \ x86/fpel.o \ x86/qpel.o diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm index 84cb7b363b..13682ba5d4 100644 --- a/libavcodec/x86/dsputilenc.asm +++ b/libavcodec/x86/dsputilenc.asm @@ -23,10 +23,6 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA - -cextern pw_1 - SECTION .text %macro DIFF_PIXELS_1 4 @@ -465,113 +461,6 @@ cglobal diff_pixels, 4, 5, 5 jne .loop RET -; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) -; %1 = number of xmm registers used -; %2 = 
number of loops -; %3 = number of GPRs used -%macro PIX_SUM16 4 -cglobal pix_sum16, 2, %3, %1 - movsxdifnidn r1, r1d - mov r2, %2 -%if cpuflag(xop) - lea r3, [r1*3] -%else - pxor m5, m5 -%endif - pxor m4, m4 -.loop: -%if cpuflag(xop) - vphaddubq m0, [r0] - vphaddubq m1, [r0+r1] - vphaddubq m2, [r0+r1*2] - vphaddubq m3, [r0+r3] -%else - mova m0, [r0] -%if mmsize == 8 - mova m1, [r0+8] -%else - mova m1, [r0+r1] -%endif - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%endif ; cpuflag(xop) - paddw m1, m0 - paddw m3, m2 - paddw m3, m1 - paddw m4, m3 -%if mmsize == 8 - add r0, r1 -%else - lea r0, [r0+r1*%4] -%endif - dec r2 - jne .loop -%if cpuflag(xop) - pshufd m0, m4, q0032 - paddd m4, m0 -%else - HADDW m4, m5 -%endif - movd eax, m4 - RET -%endmacro - -INIT_MMX mmx -PIX_SUM16 0, 16, 3, 0 -INIT_XMM sse2 -PIX_SUM16 6, 8, 3, 2 -%if HAVE_XOP_EXTERNAL -INIT_XMM xop -PIX_SUM16 5, 4, 4, 4 -%endif - -; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) -; %1 = number of xmm registers used -; %2 = number of loops -%macro PIX_NORM1 2 -cglobal pix_norm1, 2, 3, %1 - movsxdifnidn r1, r1d - mov r2, %2 - pxor m0, m0 - pxor m5, m5 -.loop: - mova m2, [r0+0] -%if mmsize == 8 - mova m3, [r0+8] -%else - mova m3, [r0+r1] -%endif - punpckhbw m1, m2, m0 - punpcklbw m2, m0 - punpckhbw m4, m3, m0 - punpcklbw m3, m0 - pmaddwd m1, m1 - pmaddwd m2, m2 - pmaddwd m3, m3 - pmaddwd m4, m4 - paddd m2, m1 - paddd m4, m3 - paddd m5, m2 - paddd m5, m4 -%if mmsize == 8 - add r0, r1 -%else - lea r0, [r0+r1*2] -%endif - dec r2 - jne .loop - HADDD m5, m1 - movd eax, m5 - RET -%endmacro - -INIT_MMX mmx -PIX_NORM1 0, 16 -INIT_XMM sse2 -PIX_NORM1 6, 8 - ;----------------------------------------------- ;int ff_sum_abs_dctelem(int16_t *block) ;----------------------------------------------- diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index f235ad0a53..9e3078b144 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c 
@@ -37,11 +37,6 @@ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride); void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride); -int ff_pix_sum16_mmx(uint8_t *pix, int line_size); -int ff_pix_sum16_sse2(uint8_t *pix, int line_size); -int ff_pix_sum16_xop(uint8_t *pix, int line_size); -int ff_pix_norm1_mmx(uint8_t *pix, int line_size); -int ff_pix_norm1_sse2(uint8_t *pix, int line_size); int ff_sum_abs_dctelem_mmx(int16_t *block); int ff_sum_abs_dctelem_mmxext(int16_t *block); int ff_sum_abs_dctelem_sse2(int16_t *block); @@ -364,8 +359,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, if (!high_bit_depth) c->get_pixels = ff_get_pixels_mmx; c->diff_pixels = ff_diff_pixels_mmx; - c->pix_sum = ff_pix_sum16_mmx; - c->pix_norm1 = ff_pix_norm1_mmx; } if (EXTERNAL_SSE2(cpu_flags)) @@ -431,8 +424,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, c->sse[0] = ff_sse16_sse2; c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2; c->diff_pixels = ff_diff_pixels_sse2; - c->pix_sum = ff_pix_sum16_sse2; - c->pix_norm1 = ff_pix_norm1_sse2; #if HAVE_ALIGNED_STACK c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; @@ -448,9 +439,5 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, #endif } - if (EXTERNAL_XOP(cpu_flags)) { - c->pix_sum = ff_pix_sum16_xop; - } - ff_dsputil_init_pix_mmx(c, avctx); } diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm new file mode 100644 index 0000000000..4fe6cfe5a6 --- /dev/null +++ b/libavcodec/x86/mpegvideoencdsp.asm @@ -0,0 +1,137 @@ +;***************************************************************************** +;* SIMD-optimized MPEG encoding functions +;***************************************************************************** +;* Copyright (c) 2000, 2001 Fabrice Bellard +;* Copyright (c) 2002-2004 Michael Niedermayer +;* +;* This file is part of FFmpeg. 
+;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;***************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pw_1 + +SECTION .text +; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) +; %1 = number of xmm registers used +; %2 = number of loops +; %3 = number of GPRs used +%macro PIX_SUM16 4 +cglobal pix_sum16, 2, %3, %1 + movsxdifnidn r1, r1d + mov r2, %2 +%if cpuflag(xop) + lea r3, [r1*3] +%else + pxor m5, m5 +%endif + pxor m4, m4 +.loop: +%if cpuflag(xop) + vphaddubq m0, [r0] + vphaddubq m1, [r0+r1] + vphaddubq m2, [r0+r1*2] + vphaddubq m3, [r0+r3] +%else + mova m0, [r0] +%if mmsize == 8 + mova m1, [r0+8] +%else + mova m1, [r0+r1] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif ; cpuflag(xop) + paddw m1, m0 + paddw m3, m2 + paddw m3, m1 + paddw m4, m3 +%if mmsize == 8 + add r0, r1 +%else + lea r0, [r0+r1*%4] +%endif + dec r2 + jne .loop +%if cpuflag(xop) + pshufd m0, m4, q0032 + paddd m4, m0 +%else + HADDW m4, m5 +%endif + movd eax, m4 + RET +%endmacro + +INIT_MMX mmx +PIX_SUM16 0, 16, 3, 0 +INIT_XMM sse2 +PIX_SUM16 6, 8, 3, 2 +%if HAVE_XOP_EXTERNAL +INIT_XMM xop +PIX_SUM16 5, 4, 4, 4 +%endif + +; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) +; 
%1 = number of xmm registers used +; %2 = number of loops +%macro PIX_NORM1 2 +cglobal pix_norm1, 2, 3, %1 + movsxdifnidn r1, r1d + mov r2, %2 + pxor m0, m0 + pxor m5, m5 +.loop: + mova m2, [r0+0] +%if mmsize == 8 + mova m3, [r0+8] +%else + mova m3, [r0+r1] +%endif + punpckhbw m1, m2, m0 + punpcklbw m2, m0 + punpckhbw m4, m3, m0 + punpcklbw m3, m0 + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m2, m1 + paddd m4, m3 + paddd m5, m2 + paddd m5, m4 +%if mmsize == 8 + add r0, r1 +%else + lea r0, [r0+r1*2] +%endif + dec r2 + jne .loop + HADDD m5, m1 + movd eax, m5 + RET +%endmacro + +INIT_MMX mmx +PIX_NORM1 0, 16 +INIT_XMM sse2 +PIX_NORM1 6, 8 + diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c index d7650ec0e1..16841893a4 100644 --- a/libavcodec/x86/mpegvideoencdsp_init.c +++ b/libavcodec/x86/mpegvideoencdsp_init.c @@ -22,6 +22,12 @@ #include "libavcodec/avcodec.h" #include "libavcodec/mpegvideoencdsp.h" +int ff_pix_sum16_mmx(uint8_t *pix, int line_size); +int ff_pix_sum16_sse2(uint8_t *pix, int line_size); +int ff_pix_sum16_xop(uint8_t *pix, int line_size); +int ff_pix_norm1_mmx(uint8_t *pix, int line_size); +int ff_pix_norm1_sse2(uint8_t *pix, int line_size); + #if HAVE_INLINE_ASM #define PHADDD(a, t) \ @@ -95,9 +101,24 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, AVCodecContext *avctx) { -#if HAVE_INLINE_ASM int cpu_flags = av_get_cpu_flags(); + if (EXTERNAL_MMX(cpu_flags)) { + c->pix_sum = ff_pix_sum16_mmx; + c->pix_norm1 = ff_pix_norm1_mmx; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + c->pix_sum = ff_pix_sum16_sse2; + c->pix_norm1 = ff_pix_norm1_sse2; + } + + if (EXTERNAL_XOP(cpu_flags)) { + c->pix_sum = ff_pix_sum16_xop; + } + +#if HAVE_INLINE_ASM + if (INLINE_MMX(cpu_flags)) { if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { c->try_8x8basis = try_8x8basis_mmx;