mirror of
https://github.com/mpv-player/mpv
synced 2025-01-01 04:36:24 +01:00
d1a54d5696
These includes were added earlier, after the definitions of the DECLARE_ALIGNED and DECLARE_ASM_CONST macros were moved to this header in FFmpeg. However, those macros are now back in the public mem.h header, so there is no need to include internal.h.
425 lines
14 KiB
C
425 lines
14 KiB
C
/*
|
|
* Discrete Cosine Transform (DCT) for SSE
|
|
* Copyright (c) 2006 Zuxy MENG <zuxy.meng@gmail.com>
|
|
* based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c
|
|
* and mp3lib/dct64_mmx.c
|
|
*/
|
|
|
|
#include "libavutil/mem.h"
|
|
|
|
/* Element type of the DCT input vector and of the b1/b2 work buffers. */
typedef float real;
|
|
|
|
/*
 * Coefficient table used by dct64_sse; its definition is not in this file.
 * Declared 16-byte aligned because dct64_sse loads from it with movaps.
 */
extern float __attribute__((aligned(16))) costab_mmx[];
|
|
|
|
/*
 * Sign-bit mask for lanes 2 and 3 ("plus, plus, negative, negative").
 * dct64_sse XORs it (xorps) into a float vector to negate those two lanes.
 * 16-byte aligned because it is loaded with movaps.
 *
 * The elements are unsigned and built with 1U << 31: shifting a signed 1
 * into the sign bit (1 << 31) is undefined behaviour in C.
 */
static const unsigned int ppnn[4] __attribute__((aligned(16))) =
{ 0, 0, 1U << 31, 1U << 31 };
|
|
|
|
/*
 * Sign-bit mask for lanes 1 and 3 ("plus, negative, plus, negative").
 * dct64_sse XORs it (xorps) into a float vector to negate those two lanes.
 * 16-byte aligned because it is loaded with movaps.
 *
 * The elements are unsigned and built with 1U << 31: shifting a signed 1
 * into the sign bit (1 << 31) is undefined behaviour in C.
 */
static const unsigned int pnpn[4] __attribute__((aligned(16))) =
{ 0, 1U << 31, 0, 1U << 31 };
|
|
|
|
/*
 * Sign-bit mask for all four lanes.
 * dct64_sse uses it to toggle a register between "no change" and
 * "negate every lane" on alternating loop iterations.
 * 16-byte aligned because it is loaded with movaps.
 *
 * The elements are unsigned and built with 1U << 31: shifting a signed 1
 * into the sign bit (1 << 31) is undefined behaviour in C.
 */
static const unsigned int nnnn[4] __attribute__((aligned(16))) =
{ 1U << 31, 1U << 31, 1U << 31, 1U << 31 };
|
|
|
|
/**
 * SSE implementation of dct64.
 *
 * Runs six butterfly passes over the 32 input floats, ping-ponging between
 * the aligned work buffers b1 and b2, then converts the results to 16-bit
 * integers with x87 fist/fistp and scatters them into out0 and out1.
 *
 * @param out0 output buffer; elements 0, 16, 32, ..., 256 are written.
 * @param out1 output buffer; elements 0, 16, 32, ..., 240 are written
 *             (out1[0] is a copy of out0[0]).
 * @param c    32 input values, c[0..31]. They are read with movaps, so the
 *             pointer must be 16-byte aligned.
 *
 * NOTE(review): several asm statements below leave values in XMM registers
 * for later, separate asm statements to consume, and none of them declares
 * XMM (or x87) clobbers. This relies on the compiler not touching those
 * registers in between; keep statement order and loop structure intact.
 */
void dct64_sse(short *out0, short *out1, real *c)
{
    DECLARE_ALIGNED(16, real, b1[0x20]);
    DECLARE_ALIGNED(16, real, b2[0x20]);
    static real const one = 1.f;

    /*
     * Pass 1: c -> b1, using costab_mmx[0..15].
     * For each group of four: b1[i..] = c[i..] + reverse(c[0x1c-i..]) and
     * b1[0x1c-i..] = (reverse(c[i..]) - c[0x1c-i..]) * reverse(costab[i..]).
     * (shufps $27 reverses the four lanes of a register.)
     */
    {
        real *costab = costab_mmx;
        int i;

        for (i = 0; i < 0x20 / 2; i += 4)
        {
            __asm__(
                "movaps %2, %%xmm3\n\t"
                "shufps $27, %%xmm3, %%xmm3\n\t"
                "movaps %3, %%xmm1\n\t"
                "movaps %%xmm1, %%xmm4\n\t"
                "movaps %4, %%xmm2\n\t"
                "shufps $27, %%xmm4, %%xmm4\n\t"
                "movaps %%xmm2, %%xmm0\n\t"
                "shufps $27, %%xmm0, %%xmm0\n\t"
                "addps %%xmm0, %%xmm1\n\t"
                "movaps %%xmm1, %0\n\t"
                "subps %%xmm2, %%xmm4\n\t"
                "mulps %%xmm3, %%xmm4\n\t"
                "movaps %%xmm4, %1\n\t"
                :"=m"(*(b1 + i)), "=m"(*(b1 + 0x1c - i))
                :"m"(*(costab + i)), "m"(*(c + i)), "m"(*(c + 0x1c - i))
               );
        }
    }

    /*
     * Pass 2: b1 -> b2, sums and differences only, per 16-element half.
     * The coefficient multiply for the difference terms is done in pass 3.
     */
    {
        int i;

        for (i = 0; i < 0x20; i += 0x10)
        {
            __asm__(
                "movaps %4, %%xmm1\n\t"
                "movaps %5, %%xmm3\n\t"
                "movaps %6, %%xmm4\n\t"
                "movaps %7, %%xmm6\n\t"
                "movaps %%xmm1, %%xmm7\n\t"
                "shufps $27, %%xmm7, %%xmm7\n\t"
                "movaps %%xmm3, %%xmm5\n\t"
                "shufps $27, %%xmm5, %%xmm5\n\t"
                "movaps %%xmm4, %%xmm2\n\t"
                "shufps $27, %%xmm2, %%xmm2\n\t"
                "movaps %%xmm6, %%xmm0\n\t"
                "shufps $27, %%xmm0, %%xmm0\n\t"
                "addps %%xmm0, %%xmm1\n\t"
                "movaps %%xmm1, %0\n\t"
                "addps %%xmm2, %%xmm3\n\t"
                "movaps %%xmm3, %1\n\t"
                "subps %%xmm4, %%xmm5\n\t"
                "movaps %%xmm5, %2\n\t"
                "subps %%xmm6, %%xmm7\n\t"
                "movaps %%xmm7, %3\n\t"
                :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4)), "=m"(*(b2 + i + 8)), "=m"(*(b2 + i + 12))
                :"m"(*(b1 + i)), "m"(*(b1 + i + 4)), "m"(*(b1 + i + 8)), "m"(*(b1 + i + 12))
               );
        }
    }

    /*
     * Pass 3: scale the difference terms of pass 2 in place, using the
     * lane-reversed coefficients costab_mmx[16..23]. The terms in the upper
     * half (b2[0x18..0x1f]) are also negated (computed as 0 - product).
     */
    {
        real *costab = costab_mmx + 16;
        __asm__(
            "movaps %4, %%xmm0\n\t"
            "movaps %5, %%xmm1\n\t"
            "movaps %8, %%xmm4\n\t"
            "xorps %%xmm6, %%xmm6\n\t"
            "shufps $27, %%xmm4, %%xmm4\n\t"
            "mulps %%xmm4, %%xmm1\n\t"
            "movaps %9, %%xmm2\n\t"
            "xorps %%xmm7, %%xmm7\n\t"
            "shufps $27, %%xmm2, %%xmm2\n\t"
            "mulps %%xmm2, %%xmm0\n\t"
            "movaps %%xmm0, %0\n\t"
            "movaps %%xmm1, %1\n\t"
            "movaps %6, %%xmm3\n\t"
            "mulps %%xmm2, %%xmm3\n\t"
            "subps %%xmm3, %%xmm6\n\t"
            "movaps %%xmm6, %2\n\t"
            "movaps %7, %%xmm5\n\t"
            "mulps %%xmm4, %%xmm5\n\t"
            "subps %%xmm5, %%xmm7\n\t"
            "movaps %%xmm7, %3\n\t"
            :"=m"(*(b2 + 8)), "=m"(*(b2 + 0xc)), "=m"(*(b2 + 0x18)), "=m"(*(b2 + 0x1c))
            :"m"(*(b2 + 8)), "m"(*(b2 + 0xc)), "m"(*(b2 + 0x18)), "m"(*(b2 + 0x1c)), "m"(*costab), "m"(*(costab + 4))
           );
    }

    /*
     * Pass 4: b2 -> b1 in groups of eight, using costab_mmx[24..27].
     * The setup asm leaves the reversed coefficients in xmm0 and the
     * all-lanes sign mask in xmm5/xmm6 for the loop body to use.
     */
    {
        real *costab = costab_mmx + 24;
        int i;

        __asm__(
            "movaps %0, %%xmm0\n\t"
            "shufps $27, %%xmm0, %%xmm0\n\t"
            "movaps %1, %%xmm5\n\t"
            "movaps %%xmm5, %%xmm6\n\t"
            :
            :"m"(*costab), "m"(*nnnn)
           );

        for (i = 0; i < 0x20; i += 8)
        {
            /*
             * xmm6 ^= xmm5 toggles xmm6 between zero and the sign mask, so
             * the difference term is negated on every second iteration.
             */
            __asm__(
                "movaps %2, %%xmm2\n\t"
                "movaps %3, %%xmm3\n\t"
                "movaps %%xmm2, %%xmm4\n\t"
                "xorps %%xmm5, %%xmm6\n\t"
                "shufps $27, %%xmm4, %%xmm4\n\t"
                "movaps %%xmm3, %%xmm1\n\t"
                "shufps $27, %%xmm1, %%xmm1\n\t"
                "addps %%xmm1, %%xmm2\n\t"
                "movaps %%xmm2, %0\n\t"
                "subps %%xmm3, %%xmm4\n\t"
                "xorps %%xmm6, %%xmm4\n\t"
                "mulps %%xmm0, %%xmm4\n\t"
                "movaps %%xmm4, %1\n\t"
                :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
                :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
               );
        }
    }

    /*
     * Pass 5: b1 -> b2 in groups of four.
     * The setup asm builds { 1, 1, costab_mmx[29], costab_mmx[28] } in xmm0
     * and loads the ppnn sign mask into xmm2; the loop body uses both.
     * It also leaves { 1, 0, 0, 0 } in xmm1, which the pass 6 setup reads.
     */
    {
        int i;

        __asm__(
            "movss %0, %%xmm1\n\t"
            "movss %1, %%xmm0\n\t"
            "movaps %%xmm1, %%xmm3\n\t"
            "unpcklps %%xmm0, %%xmm3\n\t"
            "movss %2, %%xmm2\n\t"
            "movaps %%xmm1, %%xmm0\n\t"
            "unpcklps %%xmm2, %%xmm0\n\t"
            "unpcklps %%xmm3, %%xmm0\n\t"
            "movaps %3, %%xmm2\n\t"
            :
            :"m"(one), "m"(costab_mmx[28]), "m"(costab_mmx[29]), "m"(*ppnn)
           );

        for (i = 0; i < 0x20; i += 8)
        {
            __asm__(
                "movaps %2, %%xmm3\n\t"
                "movaps %%xmm3, %%xmm4\n\t"
                "shufps $20, %%xmm4, %%xmm4\n\t"
                "shufps $235, %%xmm3, %%xmm3\n\t"
                "xorps %%xmm2, %%xmm3\n\t"
                "addps %%xmm3, %%xmm4\n\t"
                "mulps %%xmm0, %%xmm4\n\t"
                "movaps %%xmm4, %0\n\t"
                "movaps %3, %%xmm6\n\t"
                "movaps %%xmm6, %%xmm5\n\t"
                "shufps $27, %%xmm5, %%xmm5\n\t"
                "xorps %%xmm2, %%xmm5\n\t"
                "addps %%xmm5, %%xmm6\n\t"
                "mulps %%xmm0, %%xmm6\n\t"
                "movaps %%xmm6, %1\n\t"
                :"=m"(*(b2 + i)), "=m"(*(b2 + i + 4))
                :"m"(*(b1 + i)), "m"(*(b1 + i + 4))
               );
        }
    }

    /*
     * Pass 6: b2 -> b1 in pairs, for elements 8..31 only; b2[0..7] is
     * consumed directly by the output stage below.
     * The setup asm builds { 1, costab_mmx[30], 1, costab_mmx[30] } in xmm2
     * and loads the pnpn sign mask into xmm0.
     * NOTE(review): the setup reads xmm1 without loading it; it depends on
     * the value left there by the pass 5 setup asm.
     */
    {
        int i;
        __asm__(
            "movss %0, %%xmm0\n\t"
            "movaps %%xmm1, %%xmm2\n\t"
            "movaps %%xmm0, %%xmm7\n\t"
            "unpcklps %%xmm1, %%xmm2\n\t"
            "unpcklps %%xmm0, %%xmm7\n\t"
            "movaps %1, %%xmm0\n\t"
            "unpcklps %%xmm7, %%xmm2\n\t"
            :
            :"m"(costab_mmx[30]), "m"(*pnpn)
           );

        for (i = 0x8; i < 0x20; i += 8)
        {
            __asm__ volatile (
                "movaps %2, %%xmm1\n\t"
                "movaps %%xmm1, %%xmm3\n\t"
                "shufps $224, %%xmm3, %%xmm3\n\t"
                "shufps $181, %%xmm1, %%xmm1\n\t"
                "xorps %%xmm0, %%xmm1\n\t"
                "addps %%xmm1, %%xmm3\n\t"
                "mulps %%xmm2, %%xmm3\n\t"
                "movaps %%xmm3, %0\n\t"
                "movaps %3, %%xmm4\n\t"
                "movaps %%xmm4, %%xmm5\n\t"
                "shufps $224, %%xmm5, %%xmm5\n\t"
                "shufps $181, %%xmm4, %%xmm4\n\t"
                "xorps %%xmm0, %%xmm4\n\t"
                "addps %%xmm4, %%xmm5\n\t"
                "mulps %%xmm2, %%xmm5\n\t"
                "movaps %%xmm5, %1\n\t"
                :"=m"(*(b1 + i)), "=m"(*(b1 + i + 4))
                :"m"(*(b2 + i)), "m"(*(b2 + i + 4))
                :"memory"
               );
        }
        /*
         * Scalar fix-up within each group of eight. The statements are
         * order-dependent: later lines read values updated by earlier ones.
         */
        for (i = 0x8; i < 0x20; i += 8)
        {
            b1[i + 2] += b1[i + 3];
            b1[i + 6] += b1[i + 7];
            b1[i + 4] += b1[i + 6];
            b1[i + 6] += b1[i + 5];
            b1[i + 5] += b1[i + 7];
        }
    }

#if 0
    /* Reference C code */

    /*
       Should run faster than x87 asm, given that the compiler is sane.
       However, the C code doesn't round with saturation (0x7fff for too
       large positive float, 0x8000 for too small negative float). You
       can hear the difference if you listen carefully.
    */

    out0[256] = (short)(b2[0] + b2[1]);
    out0[0] = (short)((b2[0] - b2[1]) * costab_mmx[30]);
    out1[128] = (short)((b2[3] - b2[2]) * costab_mmx[30]);
    out0[128] = (short)((b2[3] - b2[2]) * costab_mmx[30] + b2[3] + b2[2]);
    out1[192] = (short)((b2[7] - b2[6]) * costab_mmx[30]);
    out0[192] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + b2[4] + b2[5]);
    out0[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + b2[6] + b2[7] + (b2[4] - b2[5]) * costab_mmx[30]);
    out1[64] = (short)((b2[7] - b2[6]) * costab_mmx[30] + (b2[4] - b2[5]) * costab_mmx[30]);

    out0[224] = (short)(b1[8] + b1[12]);
    out0[160] = (short)(b1[12] + b1[10]);
    out0[96] = (short)(b1[10] + b1[14]);
    out0[32] = (short)(b1[14] + b1[9]);
    out1[32] = (short)(b1[9] + b1[13]);
    out1[96] = (short)(b1[13] + b1[11]);
    out1[224] = (short)b1[15];
    out1[160] = (short)(b1[15] + b1[11]);
    out0[240] = (short)(b1[24] + b1[28] + b1[16]);
    out0[208] = (short)(b1[24] + b1[28] + b1[20]);
    out0[176] = (short)(b1[28] + b1[26] + b1[20]);
    out0[144] = (short)(b1[28] + b1[26] + b1[18]);
    out0[112] = (short)(b1[26] + b1[30] + b1[18]);
    out0[80] = (short)(b1[26] + b1[30] + b1[22]);
    out0[48] = (short)(b1[30] + b1[25] + b1[22]);
    out0[16] = (short)(b1[30] + b1[25] + b1[17]);
    out1[16] = (short)(b1[25] + b1[29] + b1[17]);
    out1[48] = (short)(b1[25] + b1[29] + b1[21]);
    out1[80] = (short)(b1[29] + b1[27] + b1[21]);
    out1[112] = (short)(b1[29] + b1[27] + b1[19]);
    out1[144] = (short)(b1[27] + b1[31] + b1[19]);
    out1[176] = (short)(b1[27] + b1[31] + b1[23]);
    out1[240] = (short)(b1[31]);
    out1[208] = (short)(b1[31] + b1[23]);

#else
    /*
       To do saturation efficiently in x86 we can use fist(t)(p),
       pf2iw, or packssdw. We use fist(p) here.
    */
    /*
     * Operands: %0 = costab_mmx[30] (kept on the x87 stack until the final
     * ffreep), %1 = b1, %2 = b2, %3 = out0, %4 = out1. All memory offsets
     * are in bytes.
     * NOTE(review): fist/fistp carry no operand-size suffix; the stores are
     * presumably 16-bit to match `short` -- confirm the assembler default
     * (some assemblers reject the unsuffixed form as ambiguous).
     */
    __asm__(
        "flds %0\n\t"
        "flds (%2)\n\t"
        "fadds 4(%2)\n\t"
        "fistp 512(%3)\n\t"

        "flds (%2)\n\t"
        "fsubs 4(%2)\n\t"
        "fmul %%st(1)\n\t"
        "fistp (%3)\n\t"

        "flds 12(%2)\n\t"
        "fsubs 8(%2)\n\t"
        "fmul %%st(1)\n\t"
        "fist 256(%4)\n\t"
        "fadds 12(%2)\n\t"
        "fadds 8(%2)\n\t"
        "fistp 256(%3)\n\t"

        "flds 16(%2)\n\t"
        "fsubs 20(%2)\n\t"
        "fmul %%st(1)\n\t"

        "flds 28(%2)\n\t"
        "fsubs 24(%2)\n\t"
        "fmul %%st(2)\n\t"
        "fist 384(%4)\n\t"
        "fld %%st(0)\n\t"
        "fadds 24(%2)\n\t"
        "fadds 28(%2)\n\t"
        "fld %%st(0)\n\t"
        "fadds 16(%2)\n\t"
        "fadds 20(%2)\n\t"
        "fistp 384(%3)\n\t"
        "fadd %%st(2)\n\t"
        "fistp 128(%3)\n\t"
        "faddp %%st(1)\n\t"
        "fistp 128(%4)\n\t"

        "flds 32(%1)\n\t"
        "fadds 48(%1)\n\t"
        "fistp 448(%3)\n\t"

        "flds 48(%1)\n\t"
        "fadds 40(%1)\n\t"
        "fistp 320(%3)\n\t"

        "flds 40(%1)\n\t"
        "fadds 56(%1)\n\t"
        "fistp 192(%3)\n\t"

        "flds 56(%1)\n\t"
        "fadds 36(%1)\n\t"
        "fistp 64(%3)\n\t"

        "flds 36(%1)\n\t"
        "fadds 52(%1)\n\t"
        "fistp 64(%4)\n\t"

        "flds 52(%1)\n\t"
        "fadds 44(%1)\n\t"
        "fistp 192(%4)\n\t"

        "flds 60(%1)\n\t"
        "fist 448(%4)\n\t"
        "fadds 44(%1)\n\t"
        "fistp 320(%4)\n\t"

        "flds 96(%1)\n\t"
        "fadds 112(%1)\n\t"
        "fld %%st(0)\n\t"
        "fadds 64(%1)\n\t"
        "fistp 480(%3)\n\t"
        "fadds 80(%1)\n\t"
        "fistp 416(%3)\n\t"

        "flds 112(%1)\n\t"
        "fadds 104(%1)\n\t"
        "fld %%st(0)\n\t"
        "fadds 80(%1)\n\t"
        "fistp 352(%3)\n\t"
        "fadds 72(%1)\n\t"
        "fistp 288(%3)\n\t"

        "flds 104(%1)\n\t"
        "fadds 120(%1)\n\t"
        "fld %%st(0)\n\t"
        "fadds 72(%1)\n\t"
        "fistp 224(%3)\n\t"
        "fadds 88(%1)\n\t"
        "fistp 160(%3)\n\t"

        "flds 120(%1)\n\t"
        "fadds 100(%1)\n\t"
        "fld %%st(0)\n\t"
        "fadds 88(%1)\n\t"
        "fistp 96(%3)\n\t"
        "fadds 68(%1)\n\t"
        "fistp 32(%3)\n\t"

        "flds 100(%1)\n\t"
        "fadds 116(%1)\n\t"
        "fld %%st(0)\n\t"
        "fadds 68(%1)\n\t"
        "fistp 32(%4)\n\t"
        "fadds 84(%1)\n\t"
        "fistp 96(%4)\n\t"

        "flds 116(%1)\n\t"
        "fadds 108(%1)\n\t"
        "fld %%st(0)\n\t"
        "fadds 84(%1)\n\t"
        "fistp 160(%4)\n\t"
        "fadds 76(%1)\n\t"
        "fistp 224(%4)\n\t"

        "flds 108(%1)\n\t"
        "fadds 124(%1)\n\t"
        "fld %%st(0)\n\t"
        "fadds 76(%1)\n\t"
        "fistp 288(%4)\n\t"
        "fadds 92(%1)\n\t"
        "fistp 352(%4)\n\t"

        "flds 124(%1)\n\t"
        "fist 480(%4)\n\t"
        "fadds 92(%1)\n\t"
        "fistp 416(%4)\n\t"
        ".byte 0xdf, 0xc0\n\t" // ffreep %%st(0)
        :
        :"m"(costab_mmx[30]), "r"(b1), "r"(b2), "r"(out0), "r"(out1)
        :"memory"
       );
#endif
    /* The first element of both outputs is the same value. */
    out1[0] = out0[0];
}
|
|
|