From 3efd4952dfcc0e452d28910758876884925c6175 Mon Sep 17 00:00:00 2001 From: Romain Dolbeau Date: Fri, 4 Jul 2003 09:39:05 +0000 Subject: [PATCH] PPC fixes & clean-up patch by (Romain Dolbeau ) Originally committed as revision 2008 to svn://svn.ffmpeg.org/ffmpeg/trunk --- configure | 15 ++++++--- libavcodec/ppc/dsputil_altivec.c | 10 +++--- libavcodec/ppc/dsputil_ppc.c | 43 ++++++++++++++++-------- libavcodec/ppc/dsputil_ppc.h | 57 ++++++++++++++++++++++++-------- libavcodec/ppc/gcc_fixes.h | 7 ---- libavcodec/ppc/gmc_altivec.c | 11 +++--- 6 files changed, 96 insertions(+), 47 deletions(-) diff --git a/configure b/configure index a92137ec7b..44e183a399 100755 --- a/configure +++ b/configure @@ -380,18 +380,25 @@ if test $tune != "generic"; then TUNECPU=ppc604 ;; G3|75*|ppc75*|PowerPC75*) - CFLAGS="$CFLAGS -mcpu=750" + CFLAGS="$CFLAGS -mcpu=750 -mtune=750" if test $altivec = "yes"; then echo "WARNING: tuning for PPC75x but altivec enabled !"; fi TUNECPU=ppc750 ;; - G4|74*|ppc74*|PowerPC74*) - CFLAGS="$CFLAGS -mcpu=7400" + G4|745*|ppc745*|PowerPC745*) + CFLAGS="$CFLAGS -mcpu=7450 -mtune=7450" + if test $altivec = "no"; then + echo "WARNING: tuning for PPC745x but altivec disabled !"; + fi + TUNECPU=ppc7450 + ;; + 74*|ppc74*|PowerPC74*) + CFLAGS="$CFLAGS -mcpu=7400 -mtune=7400" if test $altivec = "no"; then echo "WARNING: tuning for PPC74xx but altivec disabled !"; fi - TUNECPU=ppc7400 + TUNECPU=ppc7400 ;; G5|970|ppc970|PowerPC970|power4*|Power4*) CFLAGS="$CFLAGS -mcpu=970 -mtune=970 -mpowerpc64 -force_cpusubtype_ALL " diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c index 32e881b703..2c71d8e7bf 100644 --- a/libavcodec/ppc/dsputil_altivec.c +++ b/libavcodec/ppc/dsputil_altivec.c @@ -1086,7 +1086,9 @@ POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); pixelssum3, pixelssum4, temp4; register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); register const vector unsigned short vctwo = (const vector 
unsigned short)vec_splat_u16(2); - + +POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); + temp1 = vec_ld(0, pixels); temp2 = vec_ld(16, pixels); pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); @@ -1109,7 +1111,6 @@ POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vctwo); -POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); for (i = 0; i < h ; i++) { blockv = vec_ld(0, block); @@ -1207,7 +1208,9 @@ POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - + +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + temp1 = vec_ld(0, pixels); temp2 = vec_ld(16, pixels); pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); @@ -1230,7 +1233,6 @@ POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vcone); -POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); for (i = 0; i < h ; i++) { blockv = vec_ld(0, block); diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index 8a5de9d74f..87772b4580 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -61,7 +61,8 @@ static unsigned char* perfname[] = { "clear_blocks_dcbz128_ppc" }; #ifdef POWERPC_PERF_USE_PMC -unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total]; +unsigned long long perfdata_pmc2[powerpc_perf_total][powerpc_data_total]; +unsigned long long perfdata_pmc3[powerpc_perf_total][powerpc_data_total]; #endif #include #endif @@ -86,14 +87,22 @@ void powerpc_display_perf_report(void) (double)perfdata[i][powerpc_data_num], perfdata[i][powerpc_data_num]); #ifdef POWERPC_PERF_USE_PMC - 
if (perfdata_miss[i][powerpc_data_num] != (unsigned long long)0) + if (perfdata_pmc2[i][powerpc_data_num] != (unsigned long long)0) fprintf(stderr, " Function \"%s\" (pmc2):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n", perfname[i], - perfdata_miss[i][powerpc_data_min], - perfdata_miss[i][powerpc_data_max], - (double)perfdata_miss[i][powerpc_data_sum] / - (double)perfdata_miss[i][powerpc_data_num], - perfdata_miss[i][powerpc_data_num]); + perfdata_pmc2[i][powerpc_data_min], + perfdata_pmc2[i][powerpc_data_max], + (double)perfdata_pmc2[i][powerpc_data_sum] / + (double)perfdata_pmc2[i][powerpc_data_num], + perfdata_pmc2[i][powerpc_data_num]); + if (perfdata_pmc3[i][powerpc_data_num] != (unsigned long long)0) + fprintf(stderr, " Function \"%s\" (pmc3):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n", + perfname[i], + perfdata_pmc3[i][powerpc_data_min], + perfdata_pmc3[i][powerpc_data_max], + (double)perfdata_pmc3[i][powerpc_data_sum] / + (double)perfdata_pmc3[i][powerpc_data_num], + perfdata_pmc3[i][powerpc_data_num]); #endif } } @@ -139,7 +148,7 @@ POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz32, 1); i += 16; } for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) { - asm volatile("dcbz %0,%1" : : "r" (i), "r" (blocks) : "memory"); + asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); } if (misal) { ((unsigned long*)blocks)[188] = 0L; @@ -172,7 +181,7 @@ POWERPC_TBL_START_COUNT(powerpc_clear_blocks_dcbz128, 1); } else for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { - asm volatile("dcbzl %0,%1" : : "r" (i), "r" (blocks) : "memory"); + asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); } #else memset(blocks, 0, sizeof(DCTELEM)*6*64); @@ -209,7 +218,9 @@ long check_dcbzl_effect(void) memset(fakedata, 0xFF, 1024); - asm volatile("dcbzl %0, %1" : : "r" (fakedata_middle), "r" (zero)); + /* below the constraint "b" seems to mean "Address base register" + in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... 
*/ + asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); for (i = 0; i < 1024 ; i ++) { @@ -300,10 +311,14 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) perfdata[i][powerpc_data_sum] = 0x0000000000000000; perfdata[i][powerpc_data_num] = 0x0000000000000000; #ifdef POWERPC_PERF_USE_PMC - perfdata_miss[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFF; - perfdata_miss[i][powerpc_data_max] = 0x0000000000000000; - perfdata_miss[i][powerpc_data_sum] = 0x0000000000000000; - perfdata_miss[i][powerpc_data_num] = 0x0000000000000000; + perfdata_pmc2[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFF; + perfdata_pmc2[i][powerpc_data_max] = 0x0000000000000000; + perfdata_pmc2[i][powerpc_data_sum] = 0x0000000000000000; + perfdata_pmc2[i][powerpc_data_num] = 0x0000000000000000; + perfdata_pmc3[i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFF; + perfdata_pmc3[i][powerpc_data_max] = 0x0000000000000000; + perfdata_pmc3[i][powerpc_data_sum] = 0x0000000000000000; + perfdata_pmc3[i][powerpc_data_num] = 0x0000000000000000; #endif /* POWERPC_PERF_USE_PMC */ } } diff --git a/libavcodec/ppc/dsputil_ppc.h b/libavcodec/ppc/dsputil_ppc.h index 0e619ee17a..8c325fbc75 100644 --- a/libavcodec/ppc/dsputil_ppc.h +++ b/libavcodec/ppc/dsputil_ppc.h @@ -19,6 +19,17 @@ #ifndef _DSPUTIL_PPC_ #define _DSPUTIL_PPC_ +#ifdef CONFIG_DARWIN +/* The Apple assembler shipped w/ gcc-3.3 knows about DCBZL, previous assemblers don't + We assume here that the Darwin GCC is from Apple.... 
*/ +#if (__GNUC__ * 100 + __GNUC_MINOR__ < 303) +#define NO_DCBZL +#endif +#else /* CONFIG_DARWIN */ +/* I don't think any non-Apple assembler knows about DCBZL */ +#define NO_DCBZL +#endif /* CONFIG_DARWIN */ + #ifdef POWERPC_TBL_PERFORMANCE_REPORT void powerpc_display_perf_report(void); /* if you add to the enum below, also add to the perfname array @@ -49,7 +60,8 @@ enum powerpc_data_index { }; extern unsigned long long perfdata[powerpc_perf_total][powerpc_data_total]; #ifdef POWERPC_PERF_USE_PMC -extern unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total]; +extern unsigned long long perfdata_pmc2[powerpc_perf_total][powerpc_data_total]; +extern unsigned long long perfdata_pmc3[powerpc_perf_total][powerpc_data_total]; #endif #ifndef POWERPC_PERF_USE_PMC @@ -75,12 +87,17 @@ extern unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total]; #else /* POWERPC_PERF_USE_PMC */ #define POWERPC_GET_CYCLES(a) asm volatile("mfspr %0, 937" : "=r" (a)) -#define POWERPC_GET_MISS(a) asm volatile("mfspr %0, 938" : "=r" (a)) -#define POWERPC_TBL_DECLARE(a, cond) register unsigned long cycles_start, cycles_stop, miss_start, miss_stop -#define POWERPC_TBL_START_COUNT(a, cond) do { POWERPC_GET_MISS(miss_start); POWERPC_GET_CYCLES(cycles_start); } while (0) +#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a)) +#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 941" : "=r" (a)) +#define POWERPC_TBL_DECLARE(a, cond) register unsigned long cycles_start, cycles_stop, pmc2_start, pmc2_stop, pmc3_start, pmc3_stop +#define POWERPC_TBL_START_COUNT(a, cond) do { \ + POWERPC_GET_PMC3(pmc3_start); \ + POWERPC_GET_PMC2(pmc2_start); \ + POWERPC_GET_CYCLES(cycles_start); } while (0) #define POWERPC_TBL_STOP_COUNT(a, cond) do { \ POWERPC_GET_CYCLES(cycles_stop); \ - POWERPC_GET_MISS(miss_stop); \ + POWERPC_GET_PMC2(pmc2_stop); \ + POWERPC_GET_PMC3(pmc3_stop); \ if (cycles_stop >= cycles_start) \ { \ unsigned long diff = \ @@ -95,18 +112,32 @@ 
extern unsigned long long perfdata_miss[powerpc_perf_total][powerpc_data_total]; perfdata[a][powerpc_data_num] ++; \ } \ } \ - if (miss_stop >= miss_start) \ + if (pmc2_stop >= pmc2_start) \ { \ unsigned long diff = \ - miss_stop - miss_start; \ + pmc2_stop - pmc2_start; \ if (cond) \ { \ - if (diff < perfdata_miss[a][powerpc_data_min]) \ - perfdata_miss[a][powerpc_data_min] = diff; \ - if (diff > perfdata_miss[a][powerpc_data_max]) \ - perfdata_miss[a][powerpc_data_max] = diff; \ - perfdata_miss[a][powerpc_data_sum] += diff; \ - perfdata_miss[a][powerpc_data_num] ++; \ + if (diff < perfdata_pmc2[a][powerpc_data_min]) \ + perfdata_pmc2[a][powerpc_data_min] = diff; \ + if (diff > perfdata_pmc2[a][powerpc_data_max]) \ + perfdata_pmc2[a][powerpc_data_max] = diff; \ + perfdata_pmc2[a][powerpc_data_sum] += diff; \ + perfdata_pmc2[a][powerpc_data_num] ++; \ + } \ + } \ + if (pmc3_stop >= pmc3_start) \ + { \ + unsigned long diff = \ + pmc3_stop - pmc3_start; \ + if (cond) \ + { \ + if (diff < perfdata_pmc3[a][powerpc_data_min]) \ + perfdata_pmc3[a][powerpc_data_min] = diff; \ + if (diff > perfdata_pmc3[a][powerpc_data_max]) \ + perfdata_pmc3[a][powerpc_data_max] = diff; \ + perfdata_pmc3[a][powerpc_data_sum] += diff; \ + perfdata_pmc3[a][powerpc_data_num] ++; \ } \ } \ } while (0) diff --git a/libavcodec/ppc/gcc_fixes.h b/libavcodec/ppc/gcc_fixes.h index 653378d5bc..8fac4ab7ca 100644 --- a/libavcodec/ppc/gcc_fixes.h +++ b/libavcodec/ppc/gcc_fixes.h @@ -13,15 +13,8 @@ #ifdef CONFIG_DARWIN #define AVV(x...) (x) -/* The Apple assembler shipped w/ gcc-3.3 knows about DCBZL, previous assemblers don't - We assume here that the Darwin GCC is from Apple.... */ -#if (__GNUC__ * 100 + __GNUC_MINOR__ < 303) -#define NO_DCBZL -#endif #else #define AVV(x...) 
{x} -/* I don't think any non-Apple assembler knows about DCBZL */ -#define NO_DCBZL #if (__GNUC__ * 100 + __GNUC_MINOR__ < 303) /* This code was provided to me by Bartosch Pixa diff --git a/libavcodec/ppc/gmc_altivec.c b/libavcodec/ppc/gmc_altivec.c index 18d52bbc52..9b141078bb 100644 --- a/libavcodec/ppc/gmc_altivec.c +++ b/libavcodec/ppc/gmc_altivec.c @@ -28,9 +28,10 @@ altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8, to preserve proper dst alignement. */ +#define GMC1_PERF_COND (h==8) void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) { -POWERPC_TBL_DECLARE(altivec_gmc1_num, h == 8); +POWERPC_TBL_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); #ifdef ALTIVEC_USE_REFERENCE_C_CODE const int A=(16-x16)*(16-y16); const int B=( x16)*(16-y16); @@ -38,7 +39,7 @@ POWERPC_TBL_DECLARE(altivec_gmc1_num, h == 8); const int D=( x16)*( y16); int i; -POWERPC_TBL_START_COUNT(altivec_gmc1_num, h == 8); +POWERPC_TBL_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); for(i=0; i