avcodec: Remove DCT, FFT, MDCT and RDFT

They were replaced by TX from libavutil; the tremendous work
to get to this point (both creating TX as well as porting
the users of the components removed in this commit) was
completely performed by Lynne alone.

Removing the subsystems from configure may break some command lines,
because the --disable-fft etc. options are no longer recognized.

Co-authored-by: Lynne <dev@lynne.ee>
Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
This commit is contained in:
Andreas Rheinhardt 2023-09-28 19:57:36 +02:00
parent d9464f3e34
commit 6f7bf64dbc
52 changed files with 2 additions and 8799 deletions

11
configure vendored
View File

@ -136,13 +136,9 @@ Component options:
--disable-w32threads disable Win32 threads [autodetect]
--disable-os2threads disable OS/2 threads [autodetect]
--disable-network disable network support [no]
--disable-dct disable DCT code
--disable-dwt disable DWT code
--disable-error-resilience disable error resilience code
--disable-lsp disable LSP code
--disable-mdct disable MDCT code
--disable-rdft disable RDFT code
--disable-fft disable FFT code
--disable-faan disable floating point AAN (I)DCT code
--disable-pixelutils disable pixel utils in libavutil
@ -2004,17 +2000,13 @@ PROGRAM_LIST="
"
SUBSYSTEM_LIST="
dct
dwt
error_resilience
faan
fast_unaligned
fft
lsp
mdct
pixelutils
network
rdft
"
# COMPONENT_LIST needs to come last to ensure correct dependency checking
@ -2766,7 +2758,6 @@ cbs_h266_select="cbs"
cbs_jpeg_select="cbs"
cbs_mpeg2_select="cbs"
cbs_vp9_select="cbs"
dct_select="rdft"
deflate_wrapper_deps="zlib"
dirac_parse_select="golomb"
dovi_rpu_select="golomb"
@ -2786,7 +2777,6 @@ frame_thread_encoder_deps="encoders threads"
inflate_wrapper_deps="zlib"
intrax8_select="blockdsp wmv2dsp"
iso_media_select="mpeg4audio"
mdct_select="fft"
me_cmp_select="idctdsp"
mpeg_er_select="error_resilience"
mpegaudio_select="mpegaudiodsp mpegaudioheader"
@ -2796,7 +2786,6 @@ mpegvideoenc_select="aandcttables fdctdsp me_cmp mpegvideo pixblockdsp"
msmpeg4dec_select="h263_decoder"
msmpeg4enc_select="h263_encoder"
vc1dsp_select="h264chroma qpeldsp startcode"
rdft_select="fft"
# decoders / encoders
aac_decoder_select="adts_header mpeg4audio sinewin"

View File

@ -48,11 +48,6 @@ Files that have MIPS copyright notice in them:
float_dsp_mips.c
libm_mips.h
softfloat_tables.h
* libavcodec/
fft_fixed_32.c
fft_init_table.c
fft_table.h
mdct_fixed_32.c
* libavcodec/mips/
aacdec_fixed.c
aacsbr_fixed.c
@ -70,9 +65,6 @@ Files that have MIPS copyright notice in them:
compute_antialias_float.h
lsp_mips.h
dsputil_mips.c
fft_mips.c
fft_table.h
fft_init_table.c
fmtconvert_mips.c
iirfilter_mips.c
mpegaudiodsp_mips_fixed.c

View File

@ -32,6 +32,7 @@ OBJS = ac3_parser.o \
allcodecs.o \
avcodec.o \
avdct.o \
avfft.o \
avpacket.o \
bitstream.o \
bitstream_filters.o \
@ -81,7 +82,6 @@ OBJS-$(CONFIG_CBS_JPEG) += cbs_jpeg.o
OBJS-$(CONFIG_CBS_MPEG2) += cbs_mpeg2.o
OBJS-$(CONFIG_CBS_VP9) += cbs_vp9.o
OBJS-$(CONFIG_CRYSTALHD) += crystalhd.o
OBJS-$(CONFIG_DCT) += dct.o dct32_fixed.o dct32_float.o
OBJS-$(CONFIG_DEFLATE_WRAPPER) += zlib_wrapper.o
OBJS-$(CONFIG_DOVI_RPU) += dovi_rpu.o
OBJS-$(CONFIG_ERROR_RESILIENCE) += error_resilience.o
@ -90,9 +90,6 @@ OBJS-$(CONFIG_EXIF) += exif.o tiff_common.o
OBJS-$(CONFIG_FAANDCT) += faandct.o
OBJS-$(CONFIG_FAANIDCT) += faanidct.o
OBJS-$(CONFIG_FDCTDSP) += fdctdsp.o jfdctfst.o jfdctint.o
FFT-OBJS-$(CONFIG_HARDCODED_TABLES) += cos_tables.o
OBJS-$(CONFIG_FFT) += avfft.o fft_float.o fft_fixed_32.o \
fft_init_table.o $(FFT-OBJS-yes)
OBJS-$(CONFIG_FMTCONVERT) += fmtconvert.o
OBJS-$(CONFIG_GOLOMB) += golomb.o
OBJS-$(CONFIG_H263DSP) += h263dsp.o
@ -125,7 +122,6 @@ OBJS-$(CONFIG_LLVIDENCDSP) += lossless_videoencdsp.o
OBJS-$(CONFIG_LPC) += lpc.o
OBJS-$(CONFIG_LSP) += lsp.o
OBJS-$(CONFIG_LZF) += lzf.o
OBJS-$(CONFIG_MDCT) += mdct_float.o mdct_fixed_32.o
OBJS-$(CONFIG_ME_CMP) += me_cmp.o
OBJS-$(CONFIG_MEDIACODEC) += mediacodecdec_common.o mediacodec_surface.o mediacodec_wrapper.o mediacodec_sw_buffer.o
OBJS-$(CONFIG_MPEG_ER) += mpeg_er.o
@ -157,7 +153,6 @@ OBJS-$(CONFIG_QSV) += qsv.o
OBJS-$(CONFIG_QSVDEC) += qsvdec.o
OBJS-$(CONFIG_QSVENC) += qsvenc.o
OBJS-$(CONFIG_RANGECODER) += rangecoder.o
OBJS-$(CONFIG_RDFT) += rdft.o
OBJS-$(CONFIG_RV34DSP) += rv34dsp.o
OBJS-$(CONFIG_SINEWIN) += sinewin.o
OBJS-$(CONFIG_SNAPPY) += snappy.o
@ -1326,8 +1321,6 @@ TESTPROGS = avcodec \
TESTPROGS-$(CONFIG_AV1_VAAPI_ENCODER) += av1_levels
TESTPROGS-$(CONFIG_CABAC) += cabac
TESTPROGS-$(CONFIG_DCT) += avfft
TESTPROGS-$(CONFIG_FFT) += fft fft-fixed32
TESTPROGS-$(CONFIG_GOLOMB) += golomb
TESTPROGS-$(CONFIG_IDCTDSP) += dct
TESTPROGS-$(CONFIG_IIRFILTER) += iirfilter
@ -1347,7 +1340,6 @@ HOSTPROGS = aacps_tablegen \
aacps_fixed_tablegen \
cbrt_tablegen \
cbrt_fixed_tablegen \
cos_tablegen \
dv_tablegen \
motionpixels_tablegen \
mpegaudio_tablegen \
@ -1362,12 +1354,6 @@ CLEANFILES = *_tables.c *_tables.h *_tablegen$(HOSTEXESUF)
$(SUBDIR)tests/dct$(EXESUF): $(SUBDIR)dctref.o $(SUBDIR)aandcttab.o
$(SUBDIR)dv_tablegen$(HOSTEXESUF): $(SUBDIR)dvdata_host.o
TRIG_TABLES = cos cos_fixed sin
TRIG_TABLES := $(TRIG_TABLES:%=$(SUBDIR)%_tables.c)
$(TRIG_TABLES): $(SUBDIR)%_tables.c: $(SUBDIR)cos_tablegen$(HOSTEXESUF)
$(M)./$< $* > $@
ifdef CONFIG_SMALL
$(SUBDIR)%_tablegen$(HOSTEXESUF): HOSTCFLAGS += -DCONFIG_SMALL=1
else

View File

@ -1,5 +1,4 @@
# subsystems
OBJS-$(CONFIG_FFT) += aarch64/fft_init_aarch64.o
OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_init.o
OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
@ -36,7 +35,6 @@ ARMV8-OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp.o
# subsystems
NEON-OBJS-$(CONFIG_AAC_DECODER) += aarch64/sbrdsp_neon.o
NEON-OBJS-$(CONFIG_FFT) += aarch64/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += aarch64/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_neon.o \
@ -47,7 +45,6 @@ NEON-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_neon.o \
NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o
NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \
aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o
NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o

View File

@ -1,25 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
 * Hard-coded structure member offsets, so that assembly sources (which cannot
 * see C struct layouts) can address fields by name.
 */
#ifndef AVCODEC_AARCH64_ASM_OFFSETS_H
#define AVCODEC_AARCH64_ASM_OFFSETS_H
/* FFTContext */
/* Byte offset of the imdct_half member inside FFTContext. C code including
 * this header can validate the value at build time with
 * AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF). */
#define IMDCT_HALF 0x48
#endif /* AVCODEC_AARCH64_ASM_OFFSETS_H */

View File

@ -1,52 +0,0 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/aarch64/cpu.h"
#include "libavcodec/fft.h"
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
/**
 * Install the AArch64 NEON FFT (and, when built with CONFIG_MDCT, MDCT)
 * routines into an FFT context.
 *
 * The context is left untouched when the runtime CPU flags do not report
 * NEON support.
 *
 * @param s FFT context whose function pointers are overridden
 */
av_cold void ff_fft_init_aarch64(FFTContext *s)
{
    const int flags = av_get_cpu_flags();

    /* Without NEON there is nothing to install. */
    if (!have_neon(flags))
        return;

    /* The NEON FFT entry points are only installed for nbits below 17. */
    if (s->nbits < 17) {
        s->fft_permute = ff_fft_permute_neon;
        s->fft_calc    = ff_fft_calc_neon;
    }
#if CONFIG_MDCT
    /* The MDCT routines are installed regardless of nbits. */
    s->imdct_calc       = ff_imdct_calc_neon;
    s->imdct_half       = ff_imdct_half_neon;
    s->mdct_calc        = ff_mdct_calc_neon;
    s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
#endif
}

View File

@ -1,447 +0,0 @@
/*
* ARM NEON optimised FFT
*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2009 Naotoshi Nojiri
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
#define M_SQRT1_2 0.70710678118654752440
// transpose d0, d1, s0, s1
// Interleave two source vectors: d0 receives the even-numbered elements of
// s0/s1 (trn1) and d1 the odd-numbered ones (trn2). The element size is
// whatever arrangement the caller passes in the operands.
// d0 is written before s0/s1 are read for d1, so d0 must not alias a source.
.macro transpose d0, d1, s0, s1
trn1 \d0, \s0, \s1
trn2 \d1, \s0, \s1
.endm
// fft4_neon: 4-point complex float FFT, computed in place.
// In:  x0 = pointer to 4 complex values (32 bytes); x0 is not modified.
// Out: the same 32 bytes are overwritten with the result.
// Writes v0-v7, v16 and v17.
function fft4_neon
AARCH64_VALID_JUMP_TARGET
ld1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
fadd v4.2s, v0.2s, v1.2s // r0+r1,i0+i1
fsub v6.2s, v0.2s, v1.2s // r0-r1,i0-i1
ext v16.8b, v2.8b, v3.8b, #4
ext v17.8b, v3.8b, v2.8b, #4
fadd v5.2s, v2.2s, v3.2s // i2+i3,r2+r3
fsub v7.2s, v16.2s, v17.2s // r3-r2,i2-i3
fadd v0.2s, v4.2s, v5.2s
fsub v2.2s, v4.2s, v5.2s
fadd v1.2s, v6.2s, v7.2s
fsub v3.2s, v6.2s, v7.2s
st1 {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
ret
endfunc
// fft8_neon: 8-point complex float FFT, computed in place.
// In:  x0  = pointer to 8 complex values (64 bytes)
//      v28 = twiddle constant; not loaded here, it must be preset by the
//            caller (ff_fft_calc_neon loads it from the mppm table)
// Out: the buffer is overwritten with the result.
// On return x1 holds the original buffer pointer and x0 has been advanced
// by 32 bytes (it is post-incremented by the first load and not restored).
function fft8_neon
AARCH64_VALID_JUMP_TARGET
mov x1, x0
ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
ext v22.8b, v2.8b, v3.8b, #4
ext v23.8b, v3.8b, v2.8b, #4
fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
rev64 v27.2s, v28.2s // v27 = low two lanes of v28, swapped
fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
ext v6.8b, v4.8b, v5.8b, #4
ext v7.8b, v5.8b, v4.8b, #4
fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
fadd v0.2s, v20.2s, v21.2s
fsub v2.2s, v20.2s, v21.2s
fadd v1.2s, v22.2s, v23.2s
rev64 v26.2s, v26.2s
rev64 v27.2s, v27.2s
fsub v3.2s, v22.2s, v23.2s
fsub v6.2s, v6.2s, v7.2s
fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
fadd v7.2s, v4.2s, v5.2s
fsub v18.2s, v2.2s, v6.2s
ext v26.8b, v24.8b, v25.8b, #4
ext v27.8b, v25.8b, v24.8b, #4
fadd v2.2s, v2.2s, v6.2s
fsub v16.2s, v0.2s, v7.2s
fadd v5.2s, v25.2s, v24.2s
fsub v4.2s, v26.2s, v27.2s
fadd v0.2s, v0.2s, v7.2s
fsub v17.2s, v1.2s, v5.2s
fsub v19.2s, v3.2s, v4.2s
fadd v3.2s, v3.2s, v4.2s
fadd v1.2s, v1.2s, v5.2s
st1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]
st1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x1]
ret
endfunc
// fft16_neon: 16-point complex float FFT, computed in place.
// In:  x0  = pointer to 16 complex values (128 bytes)
// The following are read but not loaded here; ff_fft_calc_neon sets them up:
//      x14 = address of the ff_cos_16 table (4 floats are read from it)
//      v28 = mppm constant, v29 = pmmp constant
//      v30 = trans4_float tbl indices, v31 = trans8_float tbl indices
// Out: the buffer is overwritten with the result.
// On return both x0 and x1 point 128 bytes past the start of the buffer
// (each is post-incremented by 32 four times).
// Structure: an 8-point FFT on the first half interleaved with two 4-point
// FFTs on the second half, followed by an inlined combining pass.
function fft16_neon
AARCH64_VALID_JUMP_TARGET
mov x1, x0
ld1 {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32
ld1 {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32
ext v22.8b, v2.8b, v3.8b, #4
ext v23.8b, v3.8b, v2.8b, #4
fadd v4.2s, v16.2s, v17.2s // r4+r5,i4+i5
fadd v5.2s, v18.2s, v19.2s // r6+r7,i6+i7
fsub v17.2s, v16.2s, v17.2s // r4-r5,i4-i5
fsub v19.2s, v18.2s, v19.2s // r6-r7,i6-i7
rev64 v27.2s, v28.2s // v27 = low two lanes of v28, swapped
fadd v20.2s, v0.2s, v1.2s // r0+r1,i0+i1
fadd v21.2s, v2.2s, v3.2s // r2+r3,i2+i3
fmul v26.2s, v17.2s, v28.2s // -a2r*w,a2i*w
ext v6.8b, v4.8b, v5.8b, #4
ext v7.8b, v5.8b, v4.8b, #4
fmul v27.2s, v19.2s, v27.2s // a3r*w,-a3i*w
fsub v23.2s, v22.2s, v23.2s // i2-i3,r3-r2
fsub v22.2s, v0.2s, v1.2s // r0-r1,i0-i1
fmul v24.2s, v17.2s, v28.s[1] // a2r*w,a2i*w
fmul v25.2s, v19.2s, v28.s[1] // a3r*w,a3i*w
fadd v0.2s, v20.2s, v21.2s
fsub v2.2s, v20.2s, v21.2s
fadd v1.2s, v22.2s, v23.2s
rev64 v26.2s, v26.2s
rev64 v27.2s, v27.2s
fsub v3.2s, v22.2s, v23.2s
fsub v6.2s, v6.2s, v7.2s
fadd v24.2s, v24.2s, v26.2s // a2r+a2i,a2i-a2r t1,t2
fadd v25.2s, v25.2s, v27.2s // a3r-a3i,a3i+a3r t5,t6
fadd v7.2s, v4.2s, v5.2s
fsub v18.2s, v2.2s, v6.2s
ld1 {v20.4s,v21.4s}, [x0], #32
ld1 {v22.4s,v23.4s}, [x0], #32
ext v26.8b, v24.8b, v25.8b, #4
ext v27.8b, v25.8b, v24.8b, #4
fadd v2.2s, v2.2s, v6.2s
fsub v16.2s, v0.2s, v7.2s
fadd v5.2s, v25.2s, v24.2s
fsub v4.2s, v26.2s, v27.2s
transpose v24.2d, v25.2d, v20.2d, v22.2d
transpose v26.2d, v27.2d, v21.2d, v23.2d
fadd v0.2s, v0.2s, v7.2s
fsub v17.2s, v1.2s, v5.2s
fsub v19.2s, v3.2s, v4.2s
fadd v3.2s, v3.2s, v4.2s
fadd v1.2s, v1.2s, v5.2s
ext v20.16b, v21.16b, v21.16b, #4
ext v21.16b, v23.16b, v23.16b, #4
zip1 v0.2d, v0.2d, v1.2d // {z[0], z[1]}
zip1 v1.2d, v2.2d, v3.2d // {z[2], z[3]}
zip1 v2.2d, v16.2d, v17.2d // {z[o1], z[o1+1]}
zip1 v3.2d, v18.2d, v19.2d // {z[o1+2],z[o1+3]}
// 2 x fft4
transpose v22.2d, v23.2d, v20.2d, v21.2d
fadd v4.4s, v24.4s, v25.4s
fadd v5.4s, v26.4s, v27.4s
fsub v6.4s, v24.4s, v25.4s
fsub v7.4s, v22.4s, v23.4s
ld1 {v23.4s}, [x14]
fadd v24.4s, v4.4s, v5.4s // {z[o2+0],z[o2+1]}
fsub v26.4s, v4.4s, v5.4s // {z[o2+2],z[o2+3]}
fadd v25.4s, v6.4s, v7.4s // {z[o3+0],z[o3+1]}
fsub v27.4s, v6.4s, v7.4s // {z[o3+2],z[o3+3]}
//fft_pass_neon_16
rev64 v7.4s, v25.4s
fmul v25.4s, v25.4s, v23.s[1]
fmul v7.4s, v7.4s, v29.4s
fmla v25.4s, v7.4s, v23.s[3] // {t1a,t2a,t5a,t6a}
zip1 v20.4s, v24.4s, v25.4s
zip2 v21.4s, v24.4s, v25.4s
fneg v22.4s, v20.4s
fadd v4.4s, v21.4s, v20.4s
fsub v6.4s, v20.4s, v21.4s // just the second half
fadd v5.4s, v21.4s, v22.4s // just the first half
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
fsub v20.4s, v0.4s, v4.4s // {z[o2],z[o2+1]}
fadd v16.4s, v0.4s, v4.4s // {z[0], z[1]}
fsub v22.4s, v2.4s, v5.4s // {z[o3],z[o3+1]}
fadd v18.4s, v2.4s, v5.4s // {z[o1],z[o1+1]}
//second half
rev64 v6.4s, v26.4s
fmul v26.4s, v26.4s, v23.s[2]
rev64 v7.4s, v27.4s
fmul v27.4s, v27.4s, v23.s[3]
fmul v6.4s, v6.4s, v29.4s
fmul v7.4s, v7.4s, v29.4s
fmla v26.4s, v6.4s, v23.s[2] // {t1,t2,t5,t6}
fmla v27.4s, v7.4s, v23.s[1] // {t1a,t2a,t5a,t6a}
zip1 v24.4s, v26.4s, v27.4s
zip2 v25.4s, v26.4s, v27.4s
fneg v26.4s, v24.4s
fadd v4.4s, v25.4s, v24.4s
fsub v6.4s, v24.4s, v25.4s // just the second half
fadd v5.4s, v25.4s, v26.4s // just the first half
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
fadd v17.4s, v1.4s, v4.4s // {z[2], z[3]}
fsub v21.4s, v1.4s, v4.4s // {z[o2+2],z[o2+3]}
fadd v19.4s, v3.4s, v5.4s // {z[o1+2],z[o1+3]}
fsub v23.4s, v3.4s, v5.4s // {z[o3+2],z[o3+3]}
st1 {v16.4s,v17.4s}, [x1], #32
st1 {v18.4s,v19.4s}, [x1], #32
st1 {v20.4s,v21.4s}, [x1], #32
st1 {v22.4s,v23.4s}, [x1], #32
ret
endfunc
// Byte-index table for a single-register tbl (loaded into v30 by
// ff_fft_calc_neon). Selects source bytes 0-3, 8-11, 4-7, 12-15, i.e. it
// swaps the two middle 32-bit lanes of a 4-lane vector.
const trans4_float, align=4
.byte 0, 1, 2, 3
.byte 8, 9, 10, 11
.byte 4, 5, 6, 7
.byte 12, 13, 14, 15
endconst
// Byte-index table for a two-register tbl (loaded into v31 by
// ff_fft_calc_neon). Indices 0-15 address the first table register and 16-31
// the second, so the result is 32-bit lanes {second[2], first[0],
// second[3], first[1]}.
const trans8_float, align=4
.byte 24, 25, 26, 27
.byte 0, 1, 2, 3
.byte 28, 29, 30, 31
.byte 4, 5, 6, 7
endconst
// fft_pass_neon: combining pass applied after the sub-transforms of a
// split FFT (tail-called from the fft<N>_neon functions generated by def_fft).
// Register interface, as read from the code below:
//   x0 = z, base of the complex buffer
//   x2 = n; the four working pointers z, z+o1, z+o2, z+o3 are spaced
//        2*n complex values (n*16 bytes) apart
//   x4 = pointer to the twiddle table, read forwards as wre; wim is read
//        backwards starting 2*n floats further on
//   x7 = byte step applied to the wim pointer after each load
//        (ff_fft_calc_neon sets it to -8)
//   v29, v30, v31 = pmmp / trans4_float / trans8_float constants, preset by
//        ff_fft_calc_neon
// Each iteration handles two complex values at each of the four pointers.
// The first iteration is peeled off because it loads only a single wim
// value; the loop at label 1 then runs n-1 more times.
// Modifies x0-x6.
function fft_pass_neon
sub x6, x2, #1 // n - 1, loop counter
lsl x5, x2, #3 // 2 * n * sizeof FFTSample
lsl x1, x2, #4 // 2 * n * sizeof FFTComplex
add x5, x4, x5 // wim
add x3, x1, x2, lsl #5 // 4 * n * sizeof FFTComplex
add x2, x0, x2, lsl #5 // &z[o2]
add x3, x0, x3 // &z[o3]
add x1, x0, x1 // &z[o1]
ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
trn2 v25.2d, v20.2d, v22.2d
sub x5, x5, #4 // wim--
trn1 v24.2d, v20.2d, v22.2d
ld1 {v5.s}[0], [x5], x7 // d5[0] = wim[-1]
rev64 v7.4s, v25.4s
fmul v25.4s, v25.4s, v4.s[1]
ld1 {v16.4s}, [x0] // {z[0],z[1]}
fmul v7.4s, v7.4s, v29.4s
ld1 {v17.4s}, [x1] // {z[o1],z[o1+1]}
prfm pldl1keep, [x2, #16]
prfm pldl1keep, [x3, #16]
fmla v25.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
prfm pldl1keep, [x0, #16]
prfm pldl1keep, [x1, #16]
zip1 v20.4s, v24.4s, v25.4s
zip2 v21.4s, v24.4s, v25.4s
fneg v22.4s, v20.4s
fadd v4.4s, v21.4s, v20.4s
fsub v6.4s, v20.4s, v21.4s // just the second half
fadd v5.4s, v21.4s, v22.4s // just the first half
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
fadd v20.4s, v16.4s, v4.4s
fsub v22.4s, v16.4s, v4.4s
fadd v21.4s, v17.4s, v5.4s
st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
fsub v23.4s, v17.4s, v5.4s
st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
1:
ld1 {v20.4s},[x2] // {z[o2],z[o2+1]}
ld1 {v22.4s},[x3] // {z[o3],z[o3+1]}
ld1 {v4.2s}, [x4], #8 // {wre[0],wre[1]}
transpose v26.2d, v27.2d, v20.2d, v22.2d
ld1 {v5.2s}, [x5], x7 // {wim[-1],wim[0]}
rev64 v6.4s, v26.4s
fmul v26.4s, v26.4s, v4.s[0]
rev64 v7.4s, v27.4s
fmul v27.4s, v27.4s, v4.s[1]
fmul v6.4s, v6.4s, v29.4s
fmul v7.4s, v7.4s, v29.4s
ld1 {v16.4s},[x0] // {z[0],z[1]}
fmla v26.4s, v6.4s, v5.s[1] // {t1,t2,t5,t6}
fmla v27.4s, v7.4s, v5.s[0] // {t1a,t2a,t5a,t6a}
ld1 {v17.4s},[x1] // {z[o1],z[o1+1]}
subs x6, x6, #1 // n--
zip1 v20.4s, v26.4s, v27.4s
zip2 v21.4s, v26.4s, v27.4s
fneg v22.4s, v20.4s
fadd v4.4s, v21.4s, v20.4s
fsub v6.4s, v20.4s, v21.4s // just the second half
fadd v5.4s, v21.4s, v22.4s // just the first half
tbl v4.16b, {v4.16b}, v30.16b // trans4_float
tbl v5.16b, {v5.16b,v6.16b}, v31.16b // trans8_float
fadd v20.4s, v16.4s, v4.4s
fsub v22.4s, v16.4s, v4.4s
fadd v21.4s, v17.4s, v5.4s
st1 {v20.4s}, [x0], #16 // {z[0], z[1]}
fsub v23.4s, v17.4s, v5.4s
st1 {v21.4s}, [x1], #16 // {z[o1],z[o1+1]}
st1 {v22.4s}, [x2], #16 // {z[o2],z[o2+1]}
st1 {v23.4s}, [x3], #16 // {z[o3],z[o3+1]}
b.ne 1b
ret
endfunc
// def_fft n, n2, n4
// Emits fft<n>_neon, an n-point FFT built from smaller transforms.
// Every instantiation below passes n2 = n/2 and n4 = n/4. The generated
// function:
//   1. runs fft<n2>_neon on z (x0 on entry);
//   2. runs fft<n4>_neon on z + 2*n4 complex values and again on
//      z + 3*n4 complex values (x28 keeps z + n4*2*8 bytes across the calls);
//   3. restores x0 = z and tail-calls fft_pass_neon with x4 = ff_cos_<n>
//      and x2 = n4/2.
// x28 and the link register are saved on the stack around the nested calls
// and restored before the tail call, so fft_pass_neon returns straight to
// the caller of fft<n>_neon.
.macro def_fft n, n2, n4
function fft\n\()_neon, align=6
AARCH64_VALID_JUMP_TARGET
AARCH64_SIGN_LINK_REGISTER
stp x28, x30, [sp, #-16]!
add x28, x0, #\n4*2*8
bl fft\n2\()_neon
mov x0, x28
bl fft\n4\()_neon
add x0, x28, #\n4*1*8
bl fft\n4\()_neon
sub x0, x28, #\n4*2*8
ldp x28, x30, [sp], #16
AARCH64_VALIDATE_LINK_REGISTER
movrel x4, X(ff_cos_\n)
mov x2, #\n4>>1
b fft_pass_neon
endfunc
.endm
// One instantiation per power of two from 32 to 65536; the 4-, 8- and
// 16-point base cases are written out by hand above.
def_fft 32, 16, 8
def_fft 64, 32, 16
def_fft 128, 64, 32
def_fft 256, 128, 64
def_fft 512, 256, 128
def_fft 1024, 512, 256
def_fft 2048, 1024, 512
def_fft 4096, 2048, 1024
def_fft 8192, 4096, 2048
def_fft 16384, 8192, 4096
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384
// ff_fft_calc_neon: exported entry point of the NEON FFT.
// In:  x0 = context; its first 32-bit word is read as nbits
//      x1 = z, the complex buffer to transform in place
// Sets up the state shared by all fft<N>_neon routines, then branches to
// the one selected by nbits:
//   x7  = -8 (wim step used by fft_pass_neon)
//   x14 = address of ff_cos_16 (read by fft16_neon)
//   v28 = mppm, v29 = pmmp, v30 = trans4_float, v31 = trans8_float
//   x0  = z
// fft_tab_neon is indexed with nbits - 2 because its first entry is the
// 4-point transform. nbits is not range-checked here.
// The final br is a tail call: the selected routine returns directly to
// this function's caller.
function ff_fft_calc_neon, export=1
prfm pldl1keep, [x1]
movrel x10, trans4_float
ldr w2, [x0]
movrel x11, trans8_float
sub w2, w2, #2
movrel x3, fft_tab_neon
ld1 {v30.16b}, [x10]
mov x7, #-8
movrel x12, pmmp
ldr x3, [x3, x2, lsl #3]
movrel x13, mppm
movrel x14, X(ff_cos_16)
ld1 {v31.16b}, [x11]
mov x0, x1
ld1 {v29.4s}, [x12] // pmmp
ld1 {v28.4s}, [x13]
br x3
endfunc
// ff_fft_permute_neon: reorder the complex buffer z according to revtab.
// In:  x0 = context; reads nbits at offset 0, revtab at offset 8 and
//           tmp_buf at offset 16
//      x1 = z, holding n = 1 << nbits complex values of 8 bytes each
// First loop: takes two complex values from z and one 32-bit word from
// revtab per iteration. The low 16 bits of the word give the destination
// index in tmp_buf of the first value, the high 16 bits that of the second.
// Second loop: copies tmp_buf back over z, four complex values (32 bytes)
// per iteration.
function ff_fft_permute_neon, export=1
mov x6, #1
ldr w2, [x0] // nbits
ldr x3, [x0, #16] // tmp_buf
ldr x0, [x0, #8] // revtab
lsl x6, x6, x2
mov x2, x6
1:
ld1 {v0.2s,v1.2s}, [x1], #16
ldr w4, [x0], #4
uxth w5, w4
lsr w4, w4, #16
add x5, x3, x5, lsl #3
add x4, x3, x4, lsl #3
st1 {v0.2s}, [x5]
st1 {v1.2s}, [x4]
subs x6, x6, #2
b.gt 1b
sub x1, x1, x2, lsl #3 // rewind z by n * 8 bytes
1:
ld1 {v0.4s,v1.4s}, [x3], #32
st1 {v0.4s,v1.4s}, [x1], #32
subs x2, x2, #4
b.gt 1b
ret
endfunc
// Dispatch table used by ff_fft_calc_neon: entry i is the routine for a
// transform of 1 << (i + 2) points, so it is indexed with nbits - 2.
const fft_tab_neon, relocate=1
.quad fft4_neon
.quad fft8_neon
.quad fft16_neon
.quad fft32_neon
.quad fft64_neon
.quad fft128_neon
.quad fft256_neon
.quad fft512_neon
.quad fft1024_neon
.quad fft2048_neon
.quad fft4096_neon
.quad fft8192_neon
.quad fft16384_neon
.quad fft32768_neon
.quad fft65536_neon
endconst
// Sign pattern {+, -, -, +}; ff_fft_calc_neon loads it into v29.
const pmmp, align=4
.float +1.0, -1.0, -1.0, +1.0
endconst
// +/- sqrt(1/2) in the pattern {-, +, +, -}; ff_fft_calc_neon loads it
// into v28.
const mppm, align=4
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst

View File

@ -1,326 +0,0 @@
/*
* AArch64 NEON optimised MDCT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/aarch64/asm.S"
// ff_imdct_half_neon: exported half-length inverse MDCT.
// In:  x0 = context; reads revtab at offset 8, mdct_bits at offset 28 and
//           tcos at offset 32
//      x1 = output buffer
//      x2 = input buffer
// Three stages:
//   1. loop 1: walks the input from both ends, multiplies by the tcos
//      twiddles and scatters the products into the output at the positions
//      given by revtab (two 16-bit indices per 32-bit word);
//   2. calls ff_fft_calc_neon on the output buffer (x0 and x1 still hold
//      the context and the output pointer at that point);
//   3. loop 3: applies a second twiddle multiplication in place, working
//      outwards from the middle of the output in both directions.
// x19/x20 keep the context and output pointers across the call. The 32-byte
// stack frame holds x19, x20 and the link register.
function ff_imdct_half_neon, export=1
stp x19, x20, [sp, #-32]!
AARCH64_SIGN_LINK_REGISTER
str x30, [sp, #16]
mov x12, #1
ldr w14, [x0, #28] // mdct_bits
ldr x4, [x0, #32] // tcos
ldr x3, [x0, #8] // revtab
lsl x12, x12, x14 // n = 1 << nbits
lsr x14, x12, #2 // n4 = n >> 2
add x7, x2, x12, lsl #1
mov x12, #-16
sub x7, x7, #16
ld2 {v16.2s,v17.2s}, [x7], x12 // d16=x,n1 d17=x,n0
ld2 {v0.2s,v1.2s}, [x2], #16 // d0 =m0,x d1 =m1,x
rev64 v17.2s, v17.2s
ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2
fmul v6.2s, v17.2s, v2.2s
fmul v7.2s, v0.2s, v2.2s
1:
subs x14, x14, #2
ldr w6, [x3], #4
fmul v4.2s, v0.2s, v3.2s
fmul v5.2s, v17.2s, v3.2s
fsub v4.2s, v6.2s, v4.2s
fadd v5.2s, v5.2s, v7.2s
ubfm x8, x6, #16, #31
ubfm x6, x6, #0, #15
add x8, x1, x8, lsl #3
add x6, x1, x6, lsl #3
b.eq 2f
ld2 {v16.2s,v17.2s}, [x7], x12
ld2 {v0.2s,v1.2s}, [x2], #16
rev64 v17.2s, v17.2s
ld2 {v2.2s,v3.2s}, [x4], #16 // d2=c0,c1 d3=s0,s2
fmul v6.2s, v17.2s, v2.2s
fmul v7.2s, v0.2s, v2.2s
st2 {v4.s,v5.s}[0], [x6]
st2 {v4.s,v5.s}[1], [x8]
b 1b
2:
st2 {v4.s,v5.s}[0], [x6]
st2 {v4.s,v5.s}[1], [x8]
mov x19, x0
mov x20, x1
bl X(ff_fft_calc_neon)
mov x12, #1
ldr w14, [x19, #28] // mdct_bits
ldr x4, [x19, #32] // tcos
lsl x12, x12, x14 // n = 1 << nbits
lsr x14, x12, #3 // n8 = n >> 3
add x4, x4, x14, lsl #3
add x6, x20, x14, lsl #3
sub x1, x4, #16
sub x3, x6, #16
mov x7, #-16
mov x8, x6
mov x0, x3
ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =i1,r1 d1 =i0,r0
ld2 {v20.2s,v21.2s},[x6], #16 // d20=i2,r2 d21=i3,r3
ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
3:
subs x14, x14, #2
fmul v7.2s, v0.2s, v17.2s
ld2 {v18.2s,v19.2s},[x4], #16 // d17=c2,c3 d19=s2,s3
fmul v4.2s, v1.2s, v17.2s
fmul v6.2s, v21.2s, v19.2s
fmul v5.2s, v20.2s, v19.2s
fmul v22.2s, v1.2s, v16.2s
fmul v23.2s, v21.2s, v18.2s
fmul v24.2s, v0.2s, v16.2s
fmul v25.2s, v20.2s, v18.2s
fadd v7.2s, v7.2s, v22.2s
fadd v5.2s, v5.2s, v23.2s
fsub v4.2s, v4.2s, v24.2s
fsub v6.2s, v6.2s, v25.2s
b.eq 4f
ld2 {v0.2s,v1.2s}, [x3], x7
ld2 {v20.2s,v21.2s},[x6], #16
ld2 {v16.2s,v17.2s},[x1], x7 // d16=c1,c0 d18=s1,s0
rev64 v5.2s, v5.2s
rev64 v7.2s, v7.2s
st2 {v4.2s,v5.2s}, [x0], x7
st2 {v6.2s,v7.2s}, [x8], #16
b 3b
4:
rev64 v5.2s, v5.2s
rev64 v7.2s, v7.2s
st2 {v4.2s,v5.2s}, [x0]
st2 {v6.2s,v7.2s}, [x8]
ldr x30, [sp, #16]
AARCH64_VALIDATE_LINK_REGISTER
ldp x19, x20, [sp], #32
ret
endfunc
// ff_imdct_calc_neon: exported full-length inverse MDCT.
// In:  x0 = context (mdct_bits is read at offset 28)
//      x1 = output buffer
//      x2 = input buffer
// With n = 1 << mdct_bits, ff_imdct_half_neon is first run with its output
// placed n bytes into the buffer (x0 and x2 are passed through unchanged).
// The loop then fills the rest of the buffer from that result:
//   - floats read backwards from below output + 2*n bytes are negated,
//     reversed and stored forwards from the start of the output;
//   - floats read forwards from output + 2*n bytes are reversed and stored
//     backwards from the end of the output (output + 4*n bytes).
// Each iteration moves 16 bytes in both directions; x19 counts n down in
// steps of 16.
function ff_imdct_calc_neon, export=1
stp x19, x20, [sp, #-32]!
AARCH64_SIGN_LINK_REGISTER
str x30, [sp, #16]
ldr w3, [x0, #28] // mdct_bits
mov x19, #1
mov x20, x1
lsl x19, x19, x3
add x1, x1, x19
bl X(ff_imdct_half_neon)
add x0, x20, x19, lsl #2
add x1, x20, x19, lsl #1
sub x0, x0, #8
sub x2, x1, #16
mov x3, #-16
mov x6, #-8
1:
ld1 {v0.4s}, [x2], x3
prfum pldl1keep, [x0, #-16]
rev64 v0.4s, v0.4s
ld1 {v2.2s,v3.2s}, [x1], #16
fneg v4.4s, v0.4s
prfum pldl1keep, [x2, #-16]
rev64 v2.2s, v2.2s
rev64 v3.2s, v3.2s
ext v4.16b, v4.16b, v4.16b, #8
st1 {v2.2s}, [x0], x6
st1 {v3.2s}, [x0], x6
st1 {v4.4s}, [x20], #16
subs x19, x19, #16
b.gt 1b
ldr x30, [sp, #16]
AARCH64_VALIDATE_LINK_REGISTER
ldp x19, x20, [sp], #32
ret
endfunc
// ff_mdct_calc_neon: exported forward MDCT.
// In:  x0 = context; reads revtab at offset 8, mdct_bits at offset 28 and
//           tcos at offset 32
//      x1 = output buffer
//      x2 = input buffer
// Three stages:
//   1. first loop: reads the input through four pointers (two ascending,
//      two descending), folds it into complex pairs, multiplies by tcos
//      twiddles taken from both ends of a table section, and scatters the
//      results into the output at positions taken from revtab. Two revtab
//      words are used per iteration, one from the front and one at offset
//      x13 further on;
//   2. calls ff_fft_calc_neon on the output buffer;
//   3. last loop: second twiddle multiplication in place, working outwards
//      from the middle of the output, with the sign flips shown in the
//      per-instruction comments.
// The numeric label 1 is reused four times: every "1b" targets the nearest
// preceding 1: and every "1f" the nearest following one.
// x19/x20 keep the context and output pointers across the call. The 32-byte
// stack frame holds x19, x20 and the link register.
function ff_mdct_calc_neon, export=1
stp x19, x20, [sp, #-32]!
AARCH64_SIGN_LINK_REGISTER
str x30, [sp, #16]
mov x12, #1
ldr w14, [x0, #28] // mdct_bits
ldr x4, [x0, #32] // tcos
ldr x3, [x0, #8] // revtab
lsl x14, x12, x14 // n = 1 << nbits
add x7, x2, x14 // in4u
sub x9, x7, #16 // in4d
add x2, x7, x14, lsl #1 // in3u
add x8, x9, x14, lsl #1 // in3d
add x5, x4, x14, lsl #1
sub x5, x5, #16
sub x3, x3, #4
mov x12, #-16
lsr x13, x14, #1
ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
1:
fmul v7.2s, v0.2s, v21.2s // I*s
ldr w10, [x3, x13]
fmul v6.2s, v2.2s, v20.2s // -R*c
ldr w6, [x3, #4]!
fmul v4.2s, v2.2s, v21.2s // -R*s
fmul v5.2s, v0.2s, v20.2s // I*c
fmul v24.2s, v16.2s, v30.2s // R*c
fmul v25.2s, v18.2s, v31.2s // -I*s
fmul v22.2s, v16.2s, v31.2s // R*s
fmul v23.2s, v18.2s, v30.2s // I*c
subs x14, x14, #16
subs x13, x13, #8
fsub v6.2s, v6.2s, v7.2s // -R*c-I*s
fadd v7.2s, v4.2s, v5.2s // -R*s+I*c
fsub v24.2s, v25.2s, v24.2s // I*s-R*c
fadd v25.2s, v22.2s, v23.2s // R*s-I*c
b.eq 1f
mov x12, #-16
ld2 {v16.2s,v17.2s}, [x9], x12 // in0u0,in0u1 in4d1,in4d0
ld2 {v18.2s,v19.2s}, [x8], x12 // in2u0,in2u1 in3d1,in3d0
fneg v7.2s, v7.2s // R*s-I*c
ld2 {v0.2s, v1.2s}, [x7], #16 // in4u0,in4u1 in2d1,in2d0
rev64 v17.2s, v17.2s // in4d0,in4d1 in3d0,in3d1
rev64 v19.2s, v19.2s // in4d0,in4d1 in3d0,in3d1
ld2 {v2.2s, v3.2s}, [x2], #16 // in3u0,in3u1 in1d1,in1d0
fsub v0.2s, v17.2s, v0.2s // in4d-in4u I
ld2 {v20.2s,v21.2s}, [x4], #16 // c0,c1 s0,s1
rev64 v1.2s, v1.2s // in2d0,in2d1 in1d0,in1d1
rev64 v3.2s, v3.2s // in2d0,in2d1 in1d0,in1d1
ld2 {v30.2s,v31.2s}, [x5], x12 // c2,c3 s2,s3
fadd v2.2s, v2.2s, v19.2s // in3u+in3d -R
fsub v16.2s, v16.2s, v1.2s // in0u-in2d R
fadd v18.2s, v18.2s, v3.2s // in2u+in1d -I
ubfm x12, x6, #16, #31
ubfm x6, x6, #0, #15
add x12, x1, x12, lsl #3
add x6, x1, x6, lsl #3
st2 {v6.s,v7.s}[0], [x6]
st2 {v6.s,v7.s}[1], [x12]
ubfm x6, x10, #16, #31
ubfm x10, x10, #0, #15
add x6 , x1, x6, lsl #3
add x10, x1, x10, lsl #3
st2 {v24.s,v25.s}[0], [x10]
st2 {v24.s,v25.s}[1], [x6]
b 1b
1:
fneg v7.2s, v7.2s // R*s-I*c
ubfm x12, x6, #16, #31
ubfm x6, x6, #0, #15
add x12, x1, x12, lsl #3
add x6, x1, x6, lsl #3
st2 {v6.s,v7.s}[0], [x6]
st2 {v6.s,v7.s}[1], [x12]
ubfm x6, x10, #16, #31
ubfm x10, x10, #0, #15
add x6 , x1, x6, lsl #3
add x10, x1, x10, lsl #3
st2 {v24.s,v25.s}[0], [x10]
st2 {v24.s,v25.s}[1], [x6]
mov x19, x0
mov x20, x1
bl X(ff_fft_calc_neon)
mov x12, #1
ldr w14, [x19, #28] // mdct_bits
ldr x4, [x19, #32] // tcos
lsl x12, x12, x14 // n = 1 << nbits
lsr x14, x12, #3 // n8 = n >> 3
add x4, x4, x14, lsl #3
add x6, x20, x14, lsl #3
sub x1, x4, #16
sub x3, x6, #16
mov x7, #-16
mov x8, x6
mov x0, x3
ld2 {v0.2s,v1.2s}, [x3], x7 // d0 =r1,i1 d1 =r0,i0
ld2 {v20.2s,v21.2s}, [x6], #16 // d20=r2,i2 d21=r3,i3
ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
1:
subs x14, x14, #2
fmul v7.2s, v0.2s, v17.2s // r1*s1,r0*s0
ld2 {v18.2s,v19.2s}, [x4], #16 // c2,c3 s2,s3
fmul v4.2s, v1.2s, v17.2s // i1*s1,i0*s0
fmul v6.2s, v21.2s, v19.2s // i2*s2,i3*s3
fmul v5.2s, v20.2s, v19.2s // r2*s2,r3*s3
fmul v24.2s, v0.2s, v16.2s // r1*c1,r0*c0
fmul v25.2s, v20.2s, v18.2s // r2*c2,r3*c3
fmul v22.2s, v21.2s, v18.2s // i2*c2,i3*c3
fmul v23.2s, v1.2s, v16.2s // i1*c1,i0*c0
fadd v4.2s, v4.2s, v24.2s // i1*s1+r1*c1,i0*s0+r0*c0
fadd v6.2s, v6.2s, v25.2s // i2*s2+r2*c2,i3*s3+r3*c3
fsub v5.2s, v22.2s, v5.2s // i2*c2-r2*s2,i3*c3-r3*s3
fsub v7.2s, v23.2s, v7.2s // i1*c1-r1*s1,i0*c0-r0*s0
fneg v4.2s, v4.2s
fneg v6.2s, v6.2s
b.eq 1f
ld2 {v0.2s, v1.2s}, [x3], x7
ld2 {v20.2s,v21.2s}, [x6], #16
ld2 {v16.2s,v17.2s}, [x1], x7 // c1,c0 s1,s0
rev64 v5.2s, v5.2s
rev64 v7.2s, v7.2s
st2 {v4.2s,v5.2s}, [x0], x7
st2 {v6.2s,v7.2s}, [x8], #16
b 1b
1:
rev64 v5.2s, v5.2s
rev64 v7.2s, v7.2s
st2 {v4.2s,v5.2s}, [x0]
st2 {v6.2s,v7.2s}, [x8]
ldr x30, [sp, #16]
AARCH64_VALIDATE_LINK_REGISTER
ldp x19, x20, [sp], #32
ret
endfunc

View File

@ -23,15 +23,8 @@
#include "libavutil/aarch64/cpu.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/fft.h"
#include "libavcodec/synth_filter.h"
#include "asm-offsets.h"
#if HAVE_NEON
AV_CHECK_OFFSET(FFTContext, imdct_half, IMDCT_HALF);
#endif
void ff_synth_filter_float_neon(AVTXContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
float synth_buf2[32], const float window[512],

View File

@ -19,8 +19,6 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "asm-offsets.h"
#include "libavutil/aarch64/asm.S"
.macro inner_loop

View File

@ -5,7 +5,6 @@ OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \
arm/ac3dsp_arm.o
OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_arm.o
OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_arm.o
OBJS-$(CONFIG_FFT) += arm/fft_init_arm.o
OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_init_arm.o
OBJS-$(CONFIG_G722DSP) += arm/g722dsp_init_arm.o
OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o
@ -25,7 +24,6 @@ OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_init_arm.o
OBJS-$(CONFIG_RDFT) += arm/rdft_init_arm.o
OBJS-$(CONFIG_RV34DSP) += arm/rv34dsp_init_arm.o
OBJS-$(CONFIG_VC1DSP) += arm/vc1dsp_init_arm.o
OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o
@ -90,9 +88,7 @@ ARMV6-OBJS-$(CONFIG_TRUEHD_DECODER) += arm/mlpdsp_armv6.o
# VFP optimizations
# subsystems
VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
VFP-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
# decoders/encoders
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
@ -107,7 +103,6 @@ NEON-OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_neon.o \
arm/int_neon.o
NEON-OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_neon.o \
arm/blockdsp_neon.o
NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o
NEON-OBJS-$(CONFIG_FMTCONVERT) += arm/fmtconvert_neon.o
NEON-OBJS-$(CONFIG_G722DSP) += arm/g722dsp_neon.o
NEON-OBJS-$(CONFIG_H264CHROMA) += arm/h264cmc_neon.o
@ -121,10 +116,8 @@ NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_neon.o \
NEON-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_neon.o \
arm/idctdsp_neon.o \
arm/simple_idct_neon.o
NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o
NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += arm/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o
NEON-OBJS-$(CONFIG_VC1DSP) += arm/vc1dsp_init_neon.o \
arm/vc1dsp_neon.o
NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o

View File

@ -1,63 +0,0 @@
/*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/fft.h"
/* Prototypes of the VFP and NEON routines that ff_fft_init_arm() installs
 * into the FFTContext function pointers. */

/* FFT kernels: operate on the complex buffer z. */
void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
/* (I)MDCT kernels: read from input, write to output. */
void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
/**
 * Install the ARM-optimised FFT/MDCT routines into an FFT context.
 *
 * Function pointers in s are only overwritten for instruction sets that
 * av_get_cpu_flags() reports as available; all other pointers keep the value
 * the caller set up beforehand. The NEON assignments come last, so they take
 * precedence over the VFP ones when both are available.
 *
 * @param s FFT context to patch. s->nbits is read to decide whether the
 *          table-driven FFT kernels can handle the transform size.
 */
av_cold void ff_fft_init_arm(FFTContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    if (have_vfp_vm(cpu_flags)) {
        /* ff_fft_calc_vfp dispatches through fft_tab_vfp, whose last entry
         * is fft65536 (nbits == 16). Larger transforms must keep the
         * previously installed implementation, otherwise the dispatch reads
         * past the end of the table. */
        if (s->nbits < 17)
            s->fft_calc = ff_fft_calc_vfp;
#if CONFIG_MDCT
        s->imdct_half = ff_imdct_half_vfp;
#endif
    }

    if (have_neon(cpu_flags)) {
#if CONFIG_FFT
        /* fft_tab_neon also ends at fft65536, and ff_fft_permute_neon reads
         * revtab as 16-bit entries. */
        if (s->nbits < 17) {
            s->fft_permute = ff_fft_permute_neon;
            s->fft_calc    = ff_fft_calc_neon;
        }
#endif
#if CONFIG_MDCT
        s->imdct_calc       = ff_imdct_calc_neon;
        s->imdct_half       = ff_imdct_half_neon;
        s->mdct_calc        = ff_mdct_calc_neon;
        /* Set together with the NEON MDCT routines above. */
        s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
#endif
    }
}

View File

@ -1,375 +0,0 @@
/*
* ARM NEON optimised FFT
*
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
* Copyright (c) 2009 Naotoshi Nojiri
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#define M_SQRT1_2 0.70710678118654752440
@ 4-point in-place FFT (per the function name) on the 32 bytes at r0.
@ In:  r0 = pointer to the data, 128-bit aligned (the :128 qualifier below).
@ Uses d0-d7 and d16-d17 as scratch; no core register is modified.
function fft4_neon
vld1.32 {d0-d3}, [r0,:128]
vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2
vsub.f32 d6, d0, d1 @ r0-r1,i0-i1
vsub.f32 d7, d16, d17 @ r3-r2,i2-i3
vadd.f32 d4, d0, d1 @ r0+r1,i0+i1
vadd.f32 d5, d2, d3 @ i2+i3,r2+r3
vadd.f32 d1, d6, d7
vsub.f32 d3, d6, d7
vadd.f32 d0, d4, d5
vsub.f32 d2, d4, d5
vst1.32 {d0-d3}, [r0,:128]
bx lr
endfunc
@ 8-point in-place FFT (per the function name) on the 64 bytes at r0.
@ In:  r0 = pointer to the data, 128-bit aligned.
@ r1, r2 and r3 are overwritten; r0 is preserved.
function fft8_neon
mov r1, r0
vld1.32 {d0-d3}, [r1,:128]!
vld1.32 {d16-d19}, [r1,:128]
@ Build the constant in a core register: r2 = 0x3f3504f3.
movw r2, #0x04f3 @ sqrt(1/2)
movt r2, #0x3f35
eor r3, r2, #1<<31 @ r3 = r2 with the sign bit flipped
vdup.32 d31, r2
vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2
vadd.f32 d4, d16, d17 @ r4+r5,i4+i5
vmov d28, r3, r2
vadd.f32 d5, d18, d19 @ r6+r7,i6+i7
vsub.f32 d17, d16, d17 @ r4-r5,i4-i5
vsub.f32 d19, d18, d19 @ r6-r7,i6-i7
vrev64.32 d29, d28
vadd.f32 d20, d0, d1 @ r0+r1,i0+i1
vadd.f32 d21, d2, d3 @ r2+r3,i2+i3
vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w
vext.32 q3, q2, q2, #1
vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w
vsub.f32 d23, d22, d23 @ i2-i3,r3-r2
vsub.f32 d22, d0, d1 @ r0-r1,i0-i1
vmul.f32 d24, d17, d31 @ a2r*w,a2i*w
vmul.f32 d25, d19, d31 @ a3r*w,a3i*w
vadd.f32 d0, d20, d21
vsub.f32 d2, d20, d21
vadd.f32 d1, d22, d23
vrev64.32 q13, q13
vsub.f32 d3, d22, d23
vsub.f32 d6, d6, d7
vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2
vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6
vadd.f32 d7, d4, d5
vsub.f32 d18, d2, d6
vext.32 q13, q12, q12, #1
vadd.f32 d2, d2, d6
vsub.f32 d16, d0, d7
vadd.f32 d5, d25, d24
vsub.f32 d4, d26, d27
vadd.f32 d0, d0, d7
vsub.f32 d17, d1, d5
vsub.f32 d19, d3, d4
vadd.f32 d3, d3, d4
vadd.f32 d1, d1, d5
@ r1 still points at the second 32-byte half, r0 at the first.
vst1.32 {d16-d19}, [r1,:128]
vst1.32 {d0-d3}, [r0,:128]
bx lr
endfunc
@ 16-point in-place FFT (per the function name) on the 128 bytes at r0.
@ In:  r0 = pointer to the data, 128-bit aligned.
@ Reads the local constants mppm and pmmp and the external table ff_cos_16.
@ r0 is used as a moving load/store cursor and is NOT restored on return;
@ r1, r2 and r3 are overwritten.
function fft16_neon
movrel r1, mppm
vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
pld [r0, #32]
vld1.32 {d2-d3}, [r1,:128]
vext.32 q13, q9, q9, #1
vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
vadd.f32 d4, d16, d17
vsub.f32 d5, d16, d17
vadd.f32 d18, d18, d19
vsub.f32 d19, d26, d27
vadd.f32 d20, d22, d23
vsub.f32 d22, d22, d23
vsub.f32 d23, d24, d25
vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1}
vadd.f32 d21, d24, d25
vmul.f32 d24, d22, d2
vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3}
vmul.f32 d25, d23, d3
vuzp.32 d16, d17 @ {r0,r1,i0,i1}
vmul.f32 q1, q11, d2[1]
vuzp.32 d18, d19 @ {r2,r3,i2,i3}
vrev64.32 q12, q12
vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6}
vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
vzip.32 q10, q11
vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
vadd.f32 d0, d22, d20
vadd.f32 d1, d21, d23
vsub.f32 d2, d21, d23
vsub.f32 d3, d22, d20
@ Three post-incremented 32-byte loads happened above; rewind to the start.
sub r0, r0, #96
vext.32 q13, q13, q13, #1
vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5}
vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
vext.32 q15, q15, q15, #1
vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7}
vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3}
vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
movrelx r2, X(ff_cos_16)
vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
vrev64.32 d1, d1
vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
vrev64.32 d3, d3
movrel r3, pmmp
vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13}
vld1.32 {d4-d5}, [r2,:64]
vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11}
vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15}
vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
vld1.32 {d6-d7}, [r3,:128]
vrev64.32 q1, q14
vmul.f32 q14, q14, d4[1]
vmul.f32 q1, q1, q3
vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a}
vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
vzip.32 q12, q14
vadd.f32 d0, d28, d24
vadd.f32 d1, d25, d29
vsub.f32 d2, d25, d29
vsub.f32 d3, d28, d24
vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9}
vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13}
@ r1 becomes the 32-byte store stride used by the vst2 sequences below.
mov r1, #32
vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5}
vrev64.32 q0, q13
vmul.f32 q13, q13, d5[0]
vrev64.32 q1, q15
vmul.f32 q15, q15, d5[1]
vst2.32 {d16-d17},[r0,:128], r1
vmul.f32 q0, q0, q3
vst2.32 {d20-d21},[r0,:128], r1
vmul.f32 q1, q1, q3
vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6}
vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a}
vst2.32 {d24-d25},[r0,:128], r1
vst2.32 {d28-d29},[r0,:128]
vzip.32 q13, q15
@ r0 is at base+96 here; step back to base+16 for the second store column.
sub r0, r0, #80
vadd.f32 d0, d30, d26
vadd.f32 d1, d27, d31
vsub.f32 d2, d27, d31
vsub.f32 d3, d30, d26
vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11}
vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3}
vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15}
vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7}
vst2.32 {d18-d19},[r0,:128], r1
vst2.32 {d22-d23},[r0,:128], r1
vst2.32 {d26-d27},[r0,:128], r1
vst2.32 {d30-d31},[r0,:128]
bx lr
endfunc
@ Combining pass over the four sub-blocks z, z+o1, z+o2, z+o3 of a buffer.
@ In:  r0 = z, r1 = wre (start of the twiddle table), r2 = n
@ wim is derived as r1 + 8*n and walked backwards while wre walks forwards.
@ Two complex values per sub-block are processed per iteration. The first
@ iteration is peeled: it applies twiddles to q11 only, while the loop at 1:
@ applies them to both q10 and q11.
@ Saves and restores r4-r6; r12 is used for the pmmp address.
function fft_pass_neon
push {r4-r6,lr}
mov r6, r2 @ n
lsl r5, r2, #3 @ 2 * n * sizeof FFTSample
lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex
lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex
add r3, r2, r4
add r4, r4, r0 @ &z[o1]
add r2, r2, r0 @ &z[o2]
add r3, r3, r0 @ &z[o3]
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
movrel r12, pmmp
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
add r5, r5, r1 @ wim
vld1.32 {d6-d7}, [r12,:128] @ pmmp
vswp d21, d22
vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]}
sub r5, r5, #4 @ wim--
vrev64.32 q1, q11
vmul.f32 q11, q11, d4[1]
vmul.f32 q1, q1, q3
vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1]
vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
sub r6, r6, #1 @ n--
vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
vzip.32 q10, q11
vadd.f32 d0, d22, d20
vadd.f32 d1, d21, d23
vsub.f32 d2, d21, d23
vsub.f32 d3, d22, d20
vsub.f32 q10, q8, q0
vadd.f32 q8, q8, q0
vsub.f32 q11, q9, q1
vadd.f32 q9, q9, q1
vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]}
vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]}
vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]}
vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]}
sub r5, r5, #8 @ wim -= 2
@ Main loop: executed until the counter in r6 reaches zero.
1:
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
vswp d21, d22
vld1.32 {d4}, [r1]! @ {wre[0],wre[1]}
vrev64.32 q0, q10
vmul.f32 q10, q10, d4[0]
vrev64.32 q1, q11
vmul.f32 q11, q11, d4[1]
vld1.32 {d5}, [r5] @ {wim[-1],wim[0]}
vmul.f32 q0, q0, q3
sub r5, r5, #8 @ wim -= 2
vmul.f32 q1, q1, q3
vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6}
vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
subs r6, r6, #1 @ n--
vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
vzip.32 q10, q11
vadd.f32 d0, d22, d20
vadd.f32 d1, d21, d23
vsub.f32 d2, d21, d23
vsub.f32 d3, d22, d20
vsub.f32 q10, q8, q0
vadd.f32 q8, q8, q0
vsub.f32 q11, q9, q1
vadd.f32 q9, q9, q1
vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]}
vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]}
vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]}
vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]}
bne 1b
pop {r4-r6,pc}
endfunc
@ def_fft n, n2, n4: emit fft<n>_neon built from the smaller transforms.
@   n  = size of the generated transform
@   n2 = size run on the buffer start (r0 as passed in)
@   n4 = size run at element offsets 2*n4 and 3*n4 (8 bytes per element)
@ The generated function keeps the buffer pointer in r4 across the three
@ calls, then tail-calls fft_pass_neon with r0 = buffer, r1 = ff_cos_<n>
@ and r2 = n4/2. Each function is aligned with .align 6.
.macro def_fft n, n2, n4
.align 6
function fft\n\()_neon
push {r4, lr}
mov r4, r0
bl fft\n2\()_neon
add r0, r4, #\n4*2*8
bl fft\n4\()_neon
add r0, r4, #\n4*3*8
bl fft\n4\()_neon
mov r0, r4
pop {r4, lr}
movrelx r1, X(ff_cos_\n)
mov r2, #\n4/2
b fft_pass_neon
endfunc
.endm
@ Instantiate fft32_neon .. fft65536_neon. Each size calls the two next
@ smaller sizes; 16 and 8 refer to the hand-written fft16_neon / fft8_neon.
def_fft 32, 16, 8
def_fft 64, 32, 16
def_fft 128, 64, 32
def_fft 256, 128, 64
def_fft 512, 256, 128
def_fft 1024, 512, 256
def_fft 2048, 1024, 512
def_fft 4096, 2048, 1024
def_fft 8192, 4096, 2048
def_fft 16384, 8192, 4096
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384
@ Exported entry point: dispatch to the size-specific kernel.
@ In:  r0 = context pointer (its first word is used as the table index base;
@           ff_fft_permute_neon labels the same word nbits), r1 = data buffer
@ Tail-calls fft_tab_neon[word - 2] with r0 = data buffer. No bounds check
@ is performed on the index.
function ff_fft_calc_neon, export=1
ldr r2, [r0]
sub r2, r2, #2
movrel r3, fft_tab_neon
ldr r3, [r3, r2, lsl #2]
mov r0, r1
bx r3
endfunc
@ Exported entry point: reorder the buffer at r1 according to revtab.
@ In:  r0 = context pointer (offsets 0, 8 and 12 are read), r1 = data buffer
@ Pass 1 scatters the 8-byte elements of the buffer into tmp_buf, taking two
@ 16-bit destination indices from each 32-bit revtab word.
@ Pass 2 copies tmp_buf back over the buffer, 32 bytes per iteration.
function ff_fft_permute_neon, export=1
push {r4,lr}
mov r12, #1
ldr r2, [r0] @ nbits
ldr r3, [r0, #12] @ tmp_buf
ldr r0, [r0, #8] @ revtab
lsl r12, r12, r2
mov r2, r12
1:
vld1.32 {d0-d1}, [r1,:128]!
ldr r4, [r0], #4
uxth lr, r4 @ low halfword: index for the first element
uxth r4, r4, ror #16 @ high halfword: index for the second element
add lr, r3, lr, lsl #3
add r4, r3, r4, lsl #3
vst1.32 {d0}, [lr,:64]
vst1.32 {d1}, [r4,:64]
subs r12, r12, #2
bgt 1b
sub r1, r1, r2, lsl #3 @ rewind r1 to the start of the buffer
1:
vld1.32 {d0-d3}, [r3,:128]!
vst1.32 {d0-d3}, [r1,:128]!
subs r2, r2, #4
bgt 1b
pop {r4,pc}
endfunc
@ Dispatch table used by ff_fft_calc_neon, indexed by (nbits - 2):
@ entry 0 is the 4-point kernel, entry 14 the 65536-point kernel.
const fft_tab_neon, relocate=1
.word fft4_neon
.word fft8_neon
.word fft16_neon
.word fft32_neon
.word fft64_neon
.word fft128_neon
.word fft256_neon
.word fft512_neon
.word fft1024_neon
.word fft2048_neon
.word fft4096_neon
.word fft8192_neon
.word fft16384_neon
.word fft32768_neon
.word fft65536_neon
endconst
@ Sign pattern (+, -, -, +); fft16_neon and fft_pass_neon load it into q3
@ and multiply lane-reversed vectors by it.
const pmmp, align=4
.float +1.0, -1.0, -1.0, +1.0
endconst
@ sqrt(1/2) with sign pattern (-, +, +, -); loaded by fft16_neon.
const mppm, align=4
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
endconst

View File

@ -1,530 +0,0 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
@ all single-precision VFP registers may be corrupted on exit. The a2
@ register may not be clobbered in these functions, as it holds the
@ stored original FPSCR.
@ Exported entry point: dispatch to the size-specific VFP kernel.
@ In:  a1 = context pointer (nbits is read from offset 0), a2 = data buffer
@ The table base is biased by -8 bytes (two entries), so nbits itself is
@ used as the index and nbits == 2 selects the first entry. The jump is made
@ by loading pc, with a1 = data buffer. No bounds check is performed.
function ff_fft_calc_vfp, export=1
ldr ip, [a1, #0] @ nbits
mov a1, a2
movrel a2, (fft_tab_vfp - 8)
ldr pc, [a2, ip, lsl #2]
endfunc
@ Dispatch table used by ff_fft_calc_vfp: 15 entries covering nbits 2..16.
const fft_tab_vfp, relocate=1
.word fft4_vfp
.word fft8_vfp
.word X(ff_fft16_vfp) @ this one alone is exported
.word fft32_vfp
.word fft64_vfp
.word fft128_vfp
.word fft256_vfp
.word fft512_vfp
.word fft1024_vfp
.word fft2048_vfp
.word fft4096_vfp
.word fft8192_vfp
.word fft16384_vfp
.word fft32768_vfp
.word fft65536_vfp
endconst
@ 4-point in-place FFT on the four 8-byte elements z[0..3] at a1.
@ Reached directly through fft_tab_vfp; unlike the larger sizes it has no
@ wrapper that reprograms FPSCR or saves s16-s31, and it only touches
@ s0-s15. The stall markers record the original author's scheduling notes.
function fft4_vfp
vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
@ stall
vadd.f s12, s0, s8 @ i0
vadd.f s13, s1, s9 @ i1
vadd.f s14, s2, s10 @ i2
vadd.f s15, s3, s11 @ i3
vsub.f s8, s0, s8 @ i4
vsub.f s9, s1, s9 @ i5
vsub.f s10, s2, s10 @ i6
vsub.f s11, s3, s11 @ i7
@ stall
@ stall
vadd.f s0, s12, s14 @ z[0].re
vsub.f s4, s12, s14 @ z[2].re
vadd.f s1, s13, s15 @ z[0].im
vsub.f s5, s13, s15 @ z[2].im
vadd.f s7, s9, s10 @ z[3].im
vsub.f s3, s9, s10 @ z[1].im
vadd.f s2, s8, s11 @ z[1].re
vsub.f s6, s8, s11 @ z[3].re
@ stall
@ stall
vstr d0, [a1, #0*2*4]
vstr d2, [a1, #2*2*4]
@ stall
@ stall
vstr d1, [a1, #1*2*4]
vstr d3, [a1, #3*2*4]
bx lr
endfunc
@ Shared opening of .Lfft8_internal_vfp and .Lfft16_internal_vfp: processes
@ z[0..7] at a1 in place, except that the results for z[1] and z[3] are left
@ in d8/d9 for macro_fft8_tail to store.
@ Instructions tagged as vector op depend on the FPSCR vector length that
@ the public wrappers program before calling the internal routines.
@ Data is moved between register banks through the buffer itself (see the
@ transfer ... via memory notes), so a1 must point at writable memory.
.macro macro_fft8_head
@ FFT4
vldr d4, [a1, #0 * 2*4]
vldr d6, [a1, #1 * 2*4]
vldr d5, [a1, #2 * 2*4]
vldr d7, [a1, #3 * 2*4]
@ BF
vldr d12, [a1, #4 * 2*4]
vadd.f s16, s8, s12 @ vector op
vldr d14, [a1, #5 * 2*4]
vldr d13, [a1, #6 * 2*4]
vldr d15, [a1, #7 * 2*4]
vsub.f s20, s8, s12 @ vector op
vadd.f s0, s16, s18
vsub.f s2, s16, s18
vadd.f s1, s17, s19
vsub.f s3, s17, s19
vadd.f s7, s21, s22
vsub.f s5, s21, s22
vadd.f s4, s20, s23
vsub.f s6, s20, s23
vsub.f s20, s24, s28 @ vector op
vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
vstr d1, [a1, #1 * 2*4]
vldr s0, cos1pi4
vadd.f s16, s24, s28 @ vector op
vstr d2, [a1, #2 * 2*4]
vstr d3, [a1, #3 * 2*4]
vldr d12, [a1, #0 * 2*4]
@ TRANSFORM
vmul.f s20, s20, s0 @ vector x scalar op
vldr d13, [a1, #1 * 2*4]
vldr d14, [a1, #2 * 2*4]
vldr d15, [a1, #3 * 2*4]
@ BUTTERFLIES
vadd.f s0, s18, s16
vadd.f s1, s17, s19
vsub.f s2, s17, s19
vsub.f s3, s18, s16
vadd.f s4, s21, s20
vsub.f s5, s21, s20
vadd.f s6, s22, s23
vsub.f s7, s22, s23
vadd.f s8, s0, s24 @ vector op
vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
vstr d1, [a1, #1 * 2*4]
vldr d6, [a1, #0 * 2*4]
vldr d7, [a1, #1 * 2*4]
vadd.f s1, s5, s6
vadd.f s0, s7, s4
vsub.f s2, s5, s6
vsub.f s3, s7, s4
vsub.f s12, s24, s12 @ vector op
vsub.f s5, s29, s1
vsub.f s4, s28, s0
vsub.f s6, s30, s2
vsub.f s7, s31, s3
vadd.f s16, s0, s28 @ vector op
vstr d6, [a1, #4 * 2*4]
vstr d7, [a1, #6 * 2*4]
vstr d4, [a1, #0 * 2*4]
vstr d5, [a1, #2 * 2*4]
vstr d2, [a1, #5 * 2*4]
vstr d3, [a1, #7 * 2*4]
.endm
@ Completes macro_fft8_head by storing d8 and d9 to z[1] and z[3].
@ Kept separate so that .Lfft16_internal_vfp can place loads between the
@ head and these two stores.
.macro macro_fft8_tail
vstr d8, [a1, #1 * 2*4]
vstr d9, [a1, #3 * 2*4]
.endm
@ Internal 8-point transform on the buffer at a1: head and tail macros back
@ to back. Expects the caller to have programmed FPSCR already (see fft8_vfp).
function .Lfft8_internal_vfp
macro_fft8_head
macro_fft8_tail
bx lr
endfunc
@ Wrapper around .Lfft8_internal_vfp for callers using the normal ABI.
@ In:  a1 = data buffer
@ Saves the current FPSCR in a2, switches FPSCR to 0x03030000, preserves
@ s16-s31 on the stack, and keeps the return address in ip across the call
@ because bl overwrites lr. FPSCR is restored before returning.
function fft8_vfp
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
fmrx a2, FPSCR
fmxr FPSCR, a3
vpush {s16-s31}
mov ip, lr
bl .Lfft8_internal_vfp
vpop {s16-s31}
fmxr FPSCR, a2
bx ip
endfunc
@ Single-precision cosine constants. cos1pi4 and cos1pi8 must stay adjacent
@ and 8-byte aligned: .Lfft16_internal_vfp loads both with one
@ vldr d1, cos1pi4.
.align 3
cos1pi4: @ cos(1*pi/4) = sqrt(2)/2
.float 0.707106769084930419921875
cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
.float 0.92387950420379638671875
cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
.float 0.3826834261417388916015625
@ Internal 16-point transform on z[0..15] at a1, in place.
@ Expects FPSCR to be programmed by the caller (see ff_fft16_vfp); the
@ instructions tagged as vector op depend on that vector length.
@ Structure, following the inline markers: the shared 8-point head on
@ z[0..7], two 4-point transforms on z[8..11] and z[12..15], then the
@ TRANSFORM / TRANSFORM_ZERO combining steps. Intermediate values are
@ spilled to and reloaded from the buffer to move between register banks.
@ a2 is not written, so the FPSCR value saved there by the wrapper survives.
function .Lfft16_internal_vfp
macro_fft8_head
@ FFT4(z+8)
vldr d10, [a1, #8 * 2*4]
vldr d12, [a1, #9 * 2*4]
vldr d11, [a1, #10 * 2*4]
vldr d13, [a1, #11 * 2*4]
macro_fft8_tail
vadd.f s16, s20, s24 @ vector op
@ FFT4(z+12)
vldr d4, [a1, #12 * 2*4]
vldr d6, [a1, #13 * 2*4]
vldr d5, [a1, #14 * 2*4]
vsub.f s20, s20, s24 @ vector op
vldr d7, [a1, #15 * 2*4]
vadd.f s0, s16, s18
vsub.f s4, s16, s18
vadd.f s1, s17, s19
vsub.f s5, s17, s19
vadd.f s7, s21, s22
vsub.f s3, s21, s22
vadd.f s2, s20, s23
vsub.f s6, s20, s23
vadd.f s16, s8, s12 @ vector op
vstr d0, [a1, #8 * 2*4]
vstr d2, [a1, #10 * 2*4]
vstr d1, [a1, #9 * 2*4]
vsub.f s20, s8, s12
vstr d3, [a1, #11 * 2*4]
@ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
vldr d12, [a1, #10 * 2*4]
vadd.f s0, s16, s18
vadd.f s1, s17, s19
vsub.f s6, s16, s18
vsub.f s7, s17, s19
vsub.f s3, s21, s22
vadd.f s2, s20, s23
vadd.f s5, s21, s22
vsub.f s4, s20, s23
vstr d0, [a1, #12 * 2*4]
vmov s0, s6
@ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
vldr d6, [a1, #9 * 2*4]
vstr d1, [a1, #13 * 2*4]
vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
vstr d2, [a1, #15 * 2*4]
vldr d7, [a1, #13 * 2*4]
vadd.f s4, s25, s24
vsub.f s5, s25, s24
vsub.f s6, s0, s7
vadd.f s7, s0, s7
vmul.f s20, s12, s3 @ vector op
@ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
vldr d4, [a1, #11 * 2*4]
vldr d5, [a1, #15 * 2*4]
vldr s1, cos3pi8
vmul.f s24, s4, s2 @ vector * scalar op
vmul.f s28, s12, s1 @ vector * scalar op
vmul.f s12, s8, s1 @ vector * scalar op
vadd.f s4, s20, s29
vsub.f s5, s21, s28
vsub.f s6, s22, s31
vadd.f s7, s23, s30
vmul.f s8, s8, s3 @ vector * scalar op
vldr d8, [a1, #1 * 2*4]
vldr d9, [a1, #5 * 2*4]
vldr d10, [a1, #3 * 2*4]
vldr d11, [a1, #7 * 2*4]
vldr d14, [a1, #2 * 2*4]
vadd.f s0, s6, s4
vadd.f s1, s5, s7
vsub.f s2, s5, s7
vsub.f s3, s6, s4
vadd.f s4, s12, s9
vsub.f s5, s13, s8
vsub.f s6, s14, s11
vadd.f s7, s15, s10
vadd.f s12, s0, s16 @ vector op
vstr d0, [a1, #1 * 2*4]
vstr d1, [a1, #5 * 2*4]
vldr d4, [a1, #1 * 2*4]
vldr d5, [a1, #5 * 2*4]
vadd.f s0, s6, s4
vadd.f s1, s5, s7
vsub.f s2, s5, s7
vsub.f s3, s6, s4
vsub.f s8, s16, s8 @ vector op
vstr d6, [a1, #1 * 2*4]
vstr d7, [a1, #5 * 2*4]
vldr d15, [a1, #6 * 2*4]
vsub.f s4, s20, s0
vsub.f s5, s21, s1
vsub.f s6, s22, s2
vsub.f s7, s23, s3
vadd.f s20, s0, s20 @ vector op
vstr d4, [a1, #9 * 2*4]
@ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
vldr d6, [a1, #8 * 2*4]
vstr d5, [a1, #13 * 2*4]
vldr d7, [a1, #12 * 2*4]
vstr d2, [a1, #11 * 2*4]
vldr d8, [a1, #0 * 2*4]
vstr d3, [a1, #15 * 2*4]
vldr d9, [a1, #4 * 2*4]
vadd.f s0, s26, s24
vadd.f s1, s25, s27
vsub.f s2, s25, s27
vsub.f s3, s26, s24
vadd.f s4, s14, s12
vadd.f s5, s13, s15
vsub.f s6, s13, s15
vsub.f s7, s14, s12
vadd.f s8, s0, s28 @ vector op
vstr d0, [a1, #3 * 2*4]
vstr d1, [a1, #7 * 2*4]
vldr d6, [a1, #3 * 2*4]
vldr d7, [a1, #7 * 2*4]
vsub.f s0, s16, s4
vsub.f s1, s17, s5
vsub.f s2, s18, s6
vsub.f s3, s19, s7
vsub.f s12, s28, s12 @ vector op
vadd.f s16, s4, s16 @ vector op
vstr d10, [a1, #3 * 2*4]
vstr d11, [a1, #7 * 2*4]
vstr d4, [a1, #2 * 2*4]
vstr d5, [a1, #6 * 2*4]
vstr d0, [a1, #8 * 2*4]
vstr d1, [a1, #12 * 2*4]
vstr d6, [a1, #10 * 2*4]
vstr d7, [a1, #14 * 2*4]
vstr d8, [a1, #0 * 2*4]
vstr d9, [a1, #4 * 2*4]
bx lr
endfunc
@ Exported wrapper around .Lfft16_internal_vfp for the normal ABI.
@ In:  a1 = data buffer
@ Saves FPSCR in a2, programs FPSCR with 0x03030000, preserves s16-s31, and
@ keeps the return address in ip because bl overwrites lr. FPSCR is restored
@ from a2 before returning.
function ff_fft16_vfp, export=1
ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
fmrx a2, FPSCR
fmxr FPSCR, a3
vpush {s16-s31}
mov ip, lr
bl .Lfft16_internal_vfp
vpop {s16-s31}
fmxr FPSCR, a2
bx ip
endfunc
@ pass n, z0, z1, z2, z3: combining pass expanded inside each
@ .Lfft<N>_internal_vfp generated by def_fft.
@   n       = iteration count parameter (the loop counter a4 starts at n-1)
@   z0..z3  = base registers for the four sub-blocks; several of them may
@             name the same register, in which case the assembler symbols
@             o1, o2, o3 (set by def_fft before expansion) carry the element
@             offsets instead
@ On entry v5 points at the twiddle table; it is read upwards with vldmia,
@ while v6 = v5 + 8*n is read downwards with vldmdb.
@ The loop at 1: is software-pipelined: loads and multiplies for the next
@ iteration are issued before the loop, and the code after bne 1b drains
@ the pipeline without advancing the z pointers.
@ The conditional adds advance each distinct base register exactly once.
.macro pass n, z0, z1, z2, z3
add v6, v5, #4*2*\n
@ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
@ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
@ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
vldr d8, [\z2, #8*(o2+1)] @ s16,s17
vldmdb v6!, {s2}
vldr d9, [\z3, #8*(o3+1)] @ s18,s19
vldmia v5!, {s0,s1} @ s0 is unused
vldr s7, [\z2, #8*o2] @ t1
vmul.f s20, s16, s2 @ vector * scalar
vldr s0, [\z3, #8*o3] @ t5
vldr s6, [\z2, #8*o2+4] @ t2
vldr s3, [\z3, #8*o3+4] @ t6
vmul.f s16, s16, s1 @ vector * scalar
ldr a4, =\n-1
1: add \z0, \z0, #8*2
.if \n*4*2 >= 512
add \z1, \z1, #8*2
.endif
.if \n*4*2 >= 256
add \z2, \z2, #8*2
.endif
.if \n*4*2 >= 512
add \z3, \z3, #8*2
.endif
@ up to 2 stalls (VFP vector issuing / waiting for s0)
@ depending upon whether this is the first iteration and
@ how many add instructions are inserted above
vadd.f s4, s0, s7 @ t5
vadd.f s5, s6, s3 @ t6
vsub.f s6, s6, s3 @ t4
vsub.f s7, s0, s7 @ t3
vldr d6, [\z0, #8*0-8*2] @ s12,s13
vadd.f s0, s16, s21 @ t1
vldr d7, [\z1, #8*o1-8*2] @ s14,s15
vsub.f s1, s18, s23 @ t5
vadd.f s8, s4, s12 @ vector + vector
@ stall (VFP vector issuing)
@ stall (VFP vector issuing)
@ stall (VFP vector issuing)
vsub.f s4, s12, s4
vsub.f s5, s13, s5
vsub.f s6, s14, s6
vsub.f s7, s15, s7
vsub.f s2, s17, s20 @ t2
vadd.f s3, s19, s22 @ t6
vstr d4, [\z0, #8*0-8*2] @ s8,s9
vstr d5, [\z1, #8*o1-8*2] @ s10,s11
@ stall (waiting for s5)
vstr d2, [\z2, #8*o2-8*2] @ s4,s5
vadd.f s4, s1, s0 @ t5
vstr d3, [\z3, #8*o3-8*2] @ s6,s7
vsub.f s7, s1, s0 @ t3
vadd.f s5, s2, s3 @ t6
vsub.f s6, s2, s3 @ t4
vldr d6, [\z0, #8*1-8*2] @ s12,s13
vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
vldr d4, [\z2, #8*o2] @ s8,s9
vldmdb v6!, {s2,s3}
vldr d5, [\z3, #8*o3] @ s10,s11
vadd.f s20, s4, s12 @ vector + vector
vldmia v5!, {s0,s1}
vldr d8, [\z2, #8*(o2+1)] @ s16,s17
@ stall (VFP vector issuing)
vsub.f s4, s12, s4
vsub.f s5, s13, s5
vsub.f s6, s14, s6
vsub.f s7, s15, s7
vmul.f s12, s8, s3 @ vector * scalar
vstr d10, [\z0, #8*1-8*2] @ s20,s21
vldr d9, [\z3, #8*(o3+1)] @ s18,s19
vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
vmul.f s8, s8, s0 @ vector * scalar
vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
@ stall (waiting for s7)
vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
vmul.f s20, s16, s2 @ vector * scalar
@ stall (VFP vector issuing)
@ stall (VFP vector issuing)
@ stall (VFP vector issuing)
vadd.f s7, s8, s13 @ t1
vsub.f s6, s9, s12 @ t2
vsub.f s0, s10, s15 @ t5
vadd.f s3, s11, s14 @ t6
vmul.f s16, s16, s1 @ vector * scalar
subs a4, a4, #1
bne 1b
@ What remains is identical to the first two indentations of
@ the above, but without the increment of z
vadd.f s4, s0, s7 @ t5
vadd.f s5, s6, s3 @ t6
vsub.f s6, s6, s3 @ t4
vsub.f s7, s0, s7 @ t3
vldr d6, [\z0, #8*0] @ s12,s13
vadd.f s0, s16, s21 @ t1
vldr d7, [\z1, #8*o1] @ s14,s15
vsub.f s1, s18, s23 @ t5
vadd.f s8, s4, s12 @ vector + vector
vsub.f s4, s12, s4
vsub.f s5, s13, s5
vsub.f s6, s14, s6
vsub.f s7, s15, s7
vsub.f s2, s17, s20 @ t2
vadd.f s3, s19, s22 @ t6
vstr d4, [\z0, #8*0] @ s8,s9
vstr d5, [\z1, #8*o1] @ s10,s11
vstr d2, [\z2, #8*o2] @ s4,s5
vadd.f s4, s1, s0 @ t5
vstr d3, [\z3, #8*o3] @ s6,s7
vsub.f s7, s1, s0 @ t3
vadd.f s5, s2, s3 @ t6
vsub.f s6, s2, s3 @ t4
vldr d6, [\z0, #8*1] @ s12,s13
vldr d7, [\z1, #8*(o1+1)] @ s14,s15
vadd.f s20, s4, s12 @ vector + vector
vsub.f s4, s12, s4
vsub.f s5, s13, s5
vsub.f s6, s14, s6
vsub.f s7, s15, s7
vstr d10, [\z0, #8*1] @ s20,s21
vstr d11, [\z1, #8*(o1+1)] @ s22,s23
vstr d2, [\z2, #8*(o2+1)] @ s4,s5
vstr d3, [\z3, #8*(o3+1)] @ s6,s7
.endm
@ def_fft n, n2, n4: emit .Lfft<n>_internal_vfp plus its wrapper fft<n>_vfp.
@ Internal routine: keeps the buffer pointer in v1, runs the n2-point
@ routine on the buffer start and the n4-point routine at element offsets
@ 2*(n/4) and 3*(n/4), loads the address of ff_cos_<n> into v5, then
@ expands the pass macro.
@ Three size classes choose how the four sub-block addresses are formed:
@   n >= 512 : four separate base registers (v1-v4), offsets o1..o3 all zero
@   n >= 256 : two base registers (v1, v2), remaining distance in o1/o3
@   otherwise: one base register (v1), all distances in o1..o3
@ NOTE(review): presumably driven by the limited immediate offset range of
@ vldr/vstr - confirm.
@ Wrapper: same FPSCR save / program / restore sequence as fft8_vfp.
@ The trailing .ltorg emits the literal pool after each pair of functions.
.macro def_fft n, n2, n4
function .Lfft\n\()_internal_vfp
.if \n >= 512
push {v1-v6,lr}
.elseif \n >= 256
push {v1-v2,v5-v6,lr}
.else
push {v1,v5-v6,lr}
.endif
mov v1, a1
bl .Lfft\n2\()_internal_vfp
add a1, v1, #8*(\n/4)*2
bl .Lfft\n4\()_internal_vfp
movrelx v5, X(ff_cos_\n), a1
add a1, v1, #8*(\n/4)*3
bl .Lfft\n4\()_internal_vfp
.if \n >= 512
.set o1, 0*(\n/4/2)
.set o2, 0*(\n/4/2)
.set o3, 0*(\n/4/2)
add v2, v1, #8*2*(\n/4/2)
add v3, v1, #8*4*(\n/4/2)
add v4, v1, #8*6*(\n/4/2)
pass (\n/4/2), v1, v2, v3, v4
pop {v1-v6,pc}
.elseif \n >= 256
.set o1, 2*(\n/4/2)
.set o2, 0*(\n/4/2)
.set o3, 2*(\n/4/2)
add v2, v1, #8*4*(\n/4/2)
pass (\n/4/2), v1, v1, v2, v2
pop {v1-v2,v5-v6,pc}
.else
.set o1, 2*(\n/4/2)
.set o2, 4*(\n/4/2)
.set o3, 6*(\n/4/2)
pass (\n/4/2), v1, v1, v1, v1
pop {v1,v5-v6,pc}
.endif
endfunc
function fft\n\()_vfp
ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
fmrx a2, FPSCR
fmxr FPSCR, a3
vpush {s16-s31}
mov ip, lr
bl .Lfft\n\()_internal_vfp
vpop {s16-s31}
fmxr FPSCR, a2
bx ip
endfunc
.ltorg
.endm
@ Instantiate the 32- to 65536-point VFP transforms. Each size calls the two
@ next smaller internal routines; 16 and 8 refer to the hand-written
@ .Lfft16_internal_vfp / .Lfft8_internal_vfp.
def_fft 32, 16, 8
def_fft 64, 32, 16
def_fft 128, 64, 32
def_fft 256, 128, 64
def_fft 512, 256, 128
def_fft 1024, 512, 256
def_fft 2048, 1024, 512
def_fft 4096, 2048, 1024
def_fft 8192, 4096, 2048
def_fft 16384, 8192, 4096
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384

View File

@ -1,301 +0,0 @@
/*
* ARM NEON optimised MDCT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#define ff_fft_calc_neon X(ff_fft_calc_neon)
@ Exported half-IMDCT entry point.
@ In:  r0 = context pointer (offsets 8, 20 and 24 are read),
@      r1 = output buffer, r2 = input buffer
@ Three stages:
@   1. first loop: reads the input from both ends (r2 ascending, r7
@      descending), multiplies by the tcos table and scatters the results
@      into the output at positions taken from revtab (two 16-bit indices
@      per 32-bit word);
@   2. ff_fft_calc_neon on the output buffer, with the context saved in r4
@      and the output pointer in r6 across the call;
@   3. second loop: multiplies by tcos again, working outwards from the
@      middle of the buffer in both directions.
@ Saves and restores r4-r8.
function ff_imdct_half_neon, export=1
push {r4-r8,lr}
mov r12, #1
ldr lr, [r0, #20] @ mdct_bits
ldr r4, [r0, #24] @ tcos
ldr r3, [r0, #8] @ revtab
lsl r12, r12, lr @ n = 1 << nbits
lsr lr, r12, #2 @ n4 = n >> 2
add r7, r2, r12, lsl #1
mov r12, #-16
sub r7, r7, #16
vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
vrev64.32 d17, d17
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s1
vmul.f32 d6, d17, d2
vmul.f32 d7, d0, d2
1:
subs lr, lr, #2
ldr r6, [r3], #4
vmul.f32 d4, d0, d3
vmul.f32 d5, d17, d3
vsub.f32 d4, d6, d4
vadd.f32 d5, d5, d7
uxth r8, r6, ror #16
uxth r6, r6
add r8, r1, r8, lsl #3
add r6, r1, r6, lsl #3
beq 1f
vld2.32 {d16-d17},[r7,:128],r12
vld2.32 {d0-d1}, [r2,:128]!
vrev64.32 d17, d17
vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s1
vmul.f32 d6, d17, d2
vmul.f32 d7, d0, d2
vst2.32 {d4[0],d5[0]}, [r6,:64]
vst2.32 {d4[1],d5[1]}, [r8,:64]
b 1b
1:
vst2.32 {d4[0],d5[0]}, [r6,:64]
vst2.32 {d4[1],d5[1]}, [r8,:64]
mov r4, r0
mov r6, r1
bl ff_fft_calc_neon
mov r12, #1
ldr lr, [r4, #20] @ mdct_bits
ldr r4, [r4, #24] @ tcos
lsl r12, r12, lr @ n = 1 << nbits
lsr lr, r12, #3 @ n8 = n >> 3
add r4, r4, lr, lsl #3
add r6, r6, lr, lsl #3
sub r1, r4, #16
sub r3, r6, #16
mov r7, #-16
mov r8, r6
mov r0, r3
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
1:
subs lr, lr, #2
vmul.f32 d7, d0, d18
vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
vmul.f32 d4, d1, d18
vmul.f32 d5, d21, d19
vmul.f32 d6, d20, d19
vmul.f32 d22, d1, d16
vmul.f32 d23, d21, d17
vmul.f32 d24, d0, d16
vmul.f32 d25, d20, d17
vadd.f32 d7, d7, d22
vadd.f32 d6, d6, d23
vsub.f32 d4, d4, d24
vsub.f32 d5, d5, d25
beq 1f
vld2.32 {d0-d1}, [r3,:128], r7
vld2.32 {d20-d21},[r6,:128]!
vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128], r7
vst2.32 {d5,d7}, [r8,:128]!
b 1b
1:
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128]
vst2.32 {d5,d7}, [r8,:128]
pop {r4-r8,pc}
endfunc
@ Exported full-IMDCT entry point, built on ff_imdct_half_neon.
@ In:  r0 = context pointer (offset 20 is read), r1 = output, r2 = input
@ r4 = 1 << (word at offset 20). The half transform is written r4 bytes into
@ the output buffer (r2 is passed through untouched). The loop at 1: then
@ fills the rest of the output from that result: element-reversed copies go
@ to the top of the buffer (descending pointer r0), and element-reversed
@ copies with the sign bit flipped (veor with 1<<31) go to the bottom
@ (ascending pointer r5).
@ Saves and restores r4-r6.
function ff_imdct_calc_neon, export=1
push {r4-r6,lr}
ldr r3, [r0, #20]
mov r4, #1
mov r5, r1
lsl r4, r4, r3
add r1, r1, r4
bl X(ff_imdct_half_neon)
add r0, r5, r4, lsl #2
add r1, r5, r4, lsl #1
sub r0, r0, #8
sub r2, r1, #16
mov r3, #-16
mov r6, #-8
vmov.i32 d30, #1<<31
1:
vld1.32 {d0-d1}, [r2,:128], r3
pld [r0, #-16]
vrev64.32 q0, q0
vld1.32 {d2-d3}, [r1,:128]!
veor d4, d1, d30
pld [r2, #-16]
vrev64.32 q1, q1
veor d5, d0, d30
vst1.32 {d2}, [r0,:64], r6
vst1.32 {d3}, [r0,:64], r6
vst1.32 {d4-d5}, [r5,:128]!
subs r4, r4, #16
bgt 1b
pop {r4-r6,pc}
endfunc
@ Exported forward-MDCT entry point.
@ In:  r0 = context pointer (offsets 8, 20 and 24 are read),
@      r1 = output buffer, r2 = input buffer
@ Three stages:
@   1. first loop: combines input samples read through four moving
@      pointers (r7/r2 ascending, r9/r8 descending), multiplies by the tcos
@      table read from both ends (r4 ascending, r5 descending) and scatters
@      the results into the output via two revtab words per iteration;
@   2. ff_fft_calc_neon on the output buffer, with the context saved in r4
@      and the output pointer in r6 across the call;
@   3. second loop: multiplies by tcos again, working outwards from the
@      middle of the buffer in both directions.
@ Saves and restores r4-r10.
function ff_mdct_calc_neon, export=1
push {r4-r10,lr}
mov r12, #1
ldr lr, [r0, #20] @ mdct_bits
ldr r4, [r0, #24] @ tcos
ldr r3, [r0, #8] @ revtab
lsl lr, r12, lr @ n = 1 << nbits
add r7, r2, lr @ in4u
sub r9, r7, #16 @ in4d
add r2, r7, lr, lsl #1 @ in3u
add r8, r9, lr, lsl #1 @ in3d
add r5, r4, lr, lsl #1
sub r5, r5, #16
sub r3, r3, #4
mov r12, #-16
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
vsub.f32 d0, d18, d0 @ in4d-in4u I
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
vadd.f32 d1, d1, d19 @ in3u+in3d -R
vsub.f32 d16, d16, d2 @ in0u-in2d R
vadd.f32 d17, d17, d3 @ in2u+in1d -I
1:
vmul.f32 d7, d0, d21 @ I*s
@ The A-prefixed and T-prefixed forms below both load r10 from
@ r3 + (lr >> 1). NOTE(review): presumably the A/T prefixes select the ARM
@ or Thumb encoding via macros from asm.S - confirm.
A ldr r10, [r3, lr, lsr #1]
T lsr r10, lr, #1
T ldr r10, [r3, r10]
vmul.f32 d6, d1, d20 @ -R*c
ldr r6, [r3, #4]!
vmul.f32 d4, d1, d21 @ -R*s
vmul.f32 d5, d0, d20 @ I*c
vmul.f32 d24, d16, d30 @ R*c
vmul.f32 d25, d17, d31 @ -I*s
vmul.f32 d22, d16, d31 @ R*s
vmul.f32 d23, d17, d30 @ I*c
subs lr, lr, #16
vsub.f32 d6, d6, d7 @ -R*c-I*s
vadd.f32 d7, d4, d5 @ -R*s+I*c
vsub.f32 d24, d25, d24 @ I*s-R*c
vadd.f32 d25, d22, d23 @ R*s-I*c
beq 1f
mov r12, #-16
vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
vneg.f32 d7, d7 @ R*s-I*c
vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
vsub.f32 d0, d18, d0 @ in4d-in4u I
vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
vadd.f32 d1, d1, d19 @ in3u+in3d -R
vsub.f32 d16, d16, d2 @ in0u-in2d R
vadd.f32 d17, d17, d3 @ in2u+in1d -I
@ r12 is reused as a scratch address here, which is why it is reloaded
@ with -16 at the top of this branch.
uxth r12, r6, ror #16
uxth r6, r6
add r12, r1, r12, lsl #3
add r6, r1, r6, lsl #3
vst2.32 {d6[0],d7[0]}, [r6,:64]
vst2.32 {d6[1],d7[1]}, [r12,:64]
uxth r6, r10, ror #16
uxth r10, r10
add r6 , r1, r6, lsl #3
add r10, r1, r10, lsl #3
vst2.32 {d24[0],d25[0]},[r10,:64]
vst2.32 {d24[1],d25[1]},[r6,:64]
b 1b
1:
vneg.f32 d7, d7 @ R*s-I*c
uxth r12, r6, ror #16
uxth r6, r6
add r12, r1, r12, lsl #3
add r6, r1, r6, lsl #3
vst2.32 {d6[0],d7[0]}, [r6,:64]
vst2.32 {d6[1],d7[1]}, [r12,:64]
uxth r6, r10, ror #16
uxth r10, r10
add r6 , r1, r6, lsl #3
add r10, r1, r10, lsl #3
vst2.32 {d24[0],d25[0]},[r10,:64]
vst2.32 {d24[1],d25[1]},[r6,:64]
mov r4, r0
mov r6, r1
bl ff_fft_calc_neon
mov r12, #1
ldr lr, [r4, #20] @ mdct_bits
ldr r4, [r4, #24] @ tcos
lsl r12, r12, lr @ n = 1 << nbits
lsr lr, r12, #3 @ n8 = n >> 3
add r4, r4, lr, lsl #3
add r6, r6, lr, lsl #3
sub r1, r4, #16
sub r3, r6, #16
mov r7, #-16
mov r8, r6
mov r0, r3
vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
1:
subs lr, lr, #2
vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
vneg.f32 q2, q2
beq 1f
vld2.32 {d0-d1}, [r3,:128], r7
vld2.32 {d20-d21},[r6,:128]!
vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128], r7
vst2.32 {d5,d7}, [r8,:128]!
b 1b
1:
vrev64.32 q3, q3
vst2.32 {d4,d6}, [r0,:128]
vst2.32 {d5,d7}, [r8,:128]
pop {r4-r10,pc}
endfunc

View File

@ -1,347 +0,0 @@
/*
* Copyright (c) 2013 RISC OS Open Ltd
* Author: Ben Avison <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
/* Symbolic register names used by ff_imdct_half_vfp and its helper macros.
 * Some names alias the same physical register: ORIGOUT and J0 are both a2,
 * OLDFPSCR and REVTAB_HI are both v5, and IN_HI and OUT_HI are both v6, so
 * only one name of each pair can hold a live value at any time. */
CONTEXT .req a1
ORIGOUT .req a2
IN .req a3
OUT .req v1
REVTAB .req v2
TCOS .req v3
TSIN .req v4
OLDFPSCR .req v5
/* Scratch registers that receive the output addresses derived from REVTAB. */
J0 .req a2
J1 .req a4
J2 .req ip
J3 .req lr
/* Second set of pointers, used only by the rolled (run-time sized) loops. */
REVTAB_HI .req v5
IN_HI .req v6
OUT_HI .req v6
TCOS_HI .req sl
TSIN_HI .req fp
/* One step of the fully unrolled pre-rotation (fixed-size path).
 * Relies on the assemble-time symbols k and n4 being set by the caller.
 * Loads twiddles from TCOS/TSIN and input samples from IN at constant
 * offsets derived from k, combines them with the instructions marked
 * "vector operation" (FPSCR must already select short vectors), and
 * scatters the four resulting pairs to OUT + 8 * index, the indices being
 * read from REVTAB. Advances k by 2 at the end. */
.macro prerotation_innerloop
.set trig_lo, k
.set trig_hi, n4 - k - 2
.set in_lo, trig_lo * 2
.set in_hi, trig_hi * 2
vldr d8, [TCOS, #trig_lo*4] @ s16,s17
vldr d9, [TCOS, #trig_hi*4] @ s18,s19
vldr s0, [IN, #in_hi*4 + 12]
vldr s1, [IN, #in_hi*4 + 4]
vldr s2, [IN, #in_lo*4 + 12]
vldr s3, [IN, #in_lo*4 + 4]
vmul.f s8, s0, s16 @ vector operation
vldr d10, [TSIN, #trig_lo*4] @ s20,s21
vldr d11, [TSIN, #trig_hi*4] @ s22,s23
vldr s4, [IN, #in_lo*4]
vldr s5, [IN, #in_lo*4 + 8]
vldr s6, [IN, #in_hi*4]
vldr s7, [IN, #in_hi*4 + 8]
/* Each 32-bit load below fetches two adjacent 16-bit REVTAB entries; the
 * upper one is split off with a shift, the lower one with a mask. */
ldr J0, [REVTAB, #trig_lo*2]
vmul.f s12, s0, s20 @ vector operation
ldr J2, [REVTAB, #trig_hi*2]
mov J1, J0, lsr #16
and J0, J0, #255 @ halfword value will be < n4
vmls.f s8, s4, s20 @ vector operation
mov J3, J2, lsr #16
and J2, J2, #255 @ halfword value will be < n4
/* Turn the indices into addresses of 8-byte output slots. */
add J0, OUT, J0, lsl #3
vmla.f s12, s4, s16 @ vector operation
add J1, OUT, J1, lsl #3
add J2, OUT, J2, lsl #3
add J3, OUT, J3, lsl #3
vstr s8, [J0]
vstr s9, [J1]
vstr s10, [J2]
vstr s11, [J3]
vstr s12, [J0, #4]
vstr s13, [J1, #4]
vstr s14, [J2, #4]
vstr s15, [J3, #4]
.set k, k + 2
.endm
/* Run-time counterpart of prerotation_innerloop for arbitrary sizes.
 * Instead of assemble-time offsets it walks pointer pairs: TCOS, TSIN, IN
 * and REVTAB move upwards while TCOS_HI, TSIN_HI, IN_HI and REVTAB_HI move
 * downwards, each expansion consuming 16 bytes of input from either end.
 * The caller loops until IN meets IN_HI. */
.macro prerotation_innerloop_rolled
vldmia TCOS!, {s16,s17}
vldmdb TCOS_HI!, {s18,s19}
vldr s0, [IN_HI, #-4]
vldr s1, [IN_HI, #-12]
vldr s2, [IN, #12]
vldr s3, [IN, #4]
vmul.f s8, s0, s16 @ vector operation
vldmia TSIN!, {s20,s21}
vldmdb TSIN_HI!, {s22,s23}
vldr s4, [IN]
vldr s5, [IN, #8]
vldr s6, [IN_HI, #-16]
vldr s7, [IN_HI, #-8]
vmul.f s12, s0, s20 @ vector operation
add IN, IN, #16
sub IN_HI, IN_HI, #16
/* Four 16-bit output indices: two read forwards, two read backwards. */
ldrh J0, [REVTAB], #2
ldrh J1, [REVTAB], #2
vmls.f s8, s4, s20 @ vector operation
ldrh J3, [REVTAB_HI, #-2]!
ldrh J2, [REVTAB_HI, #-2]!
/* Turn the indices into addresses of 8-byte output slots. */
add J0, OUT, J0, lsl #3
vmla.f s12, s4, s16 @ vector operation
add J1, OUT, J1, lsl #3
add J2, OUT, J2, lsl #3
add J3, OUT, J3, lsl #3
vstr s8, [J0]
vstr s9, [J1]
vstr s10, [J2]
vstr s11, [J3]
vstr s12, [J0, #4]
vstr s13, [J1, #4]
vstr s14, [J2, #4]
vstr s15, [J3, #4]
.endm
/* One stage of the software-pipelined, fully unrolled post-rotation.
 * A non-empty "head" argument starts a new step: it loads twiddles and the
 * FFT output for the assemble-time index k and issues the first two
 * multiplies. A non-empty "tail" argument finishes the step started by the
 * previous expansion (index k - 2): it issues the remaining
 * multiply-accumulates and stores the results back into OUT.
 * The TCOS registers alternate between d10/d11 and d12/d13 depending on
 * (k & 2), so a head can load new TCOS values while the tail still uses the
 * previous ones. k is advanced by 2 only when "head" is given.
 * Relies on the assemble-time symbols k and n8. */
.macro postrotation_innerloop tail, head
.set trig_lo_head, n8 - k - 2
.set trig_hi_head, n8 + k
.set out_lo_head, trig_lo_head * 2
.set out_hi_head, trig_hi_head * 2
.set trig_lo_tail, n8 - (k - 2) - 2
.set trig_hi_tail, n8 + (k - 2)
.set out_lo_tail, trig_lo_tail * 2
.set out_hi_tail, trig_hi_tail * 2
/* Pick the TCOS register bank for this expansion; the tail reads the bank
 * that the previous head filled. */
.if (k & 2) == 0
TCOS_D0_HEAD .req d10 @ s20,s21
TCOS_D1_HEAD .req d11 @ s22,s23
TCOS_S0_TAIL .req s24
.else
TCOS_D0_HEAD .req d12 @ s24,s25
TCOS_D1_HEAD .req d13 @ s26,s27
TCOS_S0_TAIL .req s20
.endif
.ifnc "\tail",""
vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
.endif
.ifnc "\head",""
vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
.endif
.ifnc "\tail",""
vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
.endif
.ifnc "\head",""
vldr s0, [OUT, #out_lo_head*4]
vldr s1, [OUT, #out_lo_head*4 + 8]
vldr s2, [OUT, #out_hi_head*4]
vldr s3, [OUT, #out_hi_head*4 + 8]
vldr s4, [OUT, #out_lo_head*4 + 4]
vldr s5, [OUT, #out_lo_head*4 + 12]
vldr s6, [OUT, #out_hi_head*4 + 4]
vldr s7, [OUT, #out_hi_head*4 + 12]
.endif
.ifnc "\tail",""
vstr s8, [OUT, #out_lo_tail*4]
vstr s9, [OUT, #out_lo_tail*4 + 8]
vstr s10, [OUT, #out_hi_tail*4]
vstr s11, [OUT, #out_hi_tail*4 + 8]
.endif
.ifnc "\head",""
vmul.f s8, s4, s16 @ vector operation
.endif
.ifnc "\tail",""
vstr s12, [OUT, #out_hi_tail*4 + 12]
vstr s13, [OUT, #out_hi_tail*4 + 4]
vstr s14, [OUT, #out_lo_tail*4 + 12]
vstr s15, [OUT, #out_lo_tail*4 + 4]
.endif
.ifnc "\head",""
vmul.f s12, s0, s16 @ vector operation
vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
.endif
/* Release the per-expansion aliases so the next expansion can rebind them. */
.unreq TCOS_D0_HEAD
.unreq TCOS_D1_HEAD
.unreq TCOS_S0_TAIL
.ifnc "\head",""
.set k, k + 2
.endif
.endm
/* Run-time counterpart of postrotation_innerloop for arbitrary sizes.
 * The "head" and "tail" arguments have the same meaning (start a new step /
 * finish the previous one). Because the pointers advance at run time, the
 * caller passes explicitly:
 *   tcos_s0_head..tcos_s3_head  registers that the head loads TCOS into
 *   tcos_s0_tail                first TCOS register of the step being finished
 *   out_offset_head / _tail     byte offsets applied to OUT (added) and
 *                               OUT_HI (subtracted) for the loads / stores
 * TSIN and TCOS are read upwards with writeback, TSIN_HI and TCOS_HI
 * downwards with writeback. */
.macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
.ifnc "\tail",""
vmls.f s8, s0, \tcos_s0_tail @ vector operation
.endif
.ifnc "\head",""
vldmia TSIN!, {s16,s17}
vldmdb TSIN_HI!, {s18,s19}
vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head}
.endif
.ifnc "\tail",""
vmla.f s12, s4, \tcos_s0_tail @ vector operation
.endif
.ifnc "\head",""
vldr s0, [OUT, #+\out_offset_head+0]
vldr s1, [OUT, #+\out_offset_head+8]
vldr s2, [OUT_HI, #-\out_offset_head-16]
vldr s3, [OUT_HI, #-\out_offset_head-8]
vldr s4, [OUT, #+\out_offset_head+4]
vldr s5, [OUT, #+\out_offset_head+12]
vldr s6, [OUT_HI, #-\out_offset_head-12]
vldr s7, [OUT_HI, #-\out_offset_head-4]
.endif
.ifnc "\tail",""
vstr s8, [OUT, #+\out_offset_tail+0]
vstr s9, [OUT, #+\out_offset_tail+8]
vstr s10, [OUT_HI, #-\out_offset_tail-16]
vstr s11, [OUT_HI, #-\out_offset_tail-8]
.endif
.ifnc "\head",""
vmul.f s8, s4, s16 @ vector operation
.endif
.ifnc "\tail",""
vstr s12, [OUT_HI, #-\out_offset_tail-4]
vstr s13, [OUT_HI, #-\out_offset_tail-12]
vstr s14, [OUT, #+\out_offset_tail+12]
vstr s15, [OUT, #+\out_offset_tail+4]
.endif
.ifnc "\head",""
vmul.f s12, s0, s16 @ vector operation
vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
.endif
.endm
/* void ff_imdct_half_vfp(FFTContext *s,
 *                        FFTSample *output,
 *                        const FFTSample *input)
 *
 * Pre-rotation, FFT and post-rotation using VFP short-vector mode.
 * When s->mdct_bits is 6 a fully unrolled path is taken that calls
 * ff_fft16_vfp directly; every other size uses the rolled loops at label 10
 * and calls the function pointer stored at word offset 9 of the context.
 * The context is read at word offsets 2 (REVTAB), 5 (mdct_bits), 6 (TCOS),
 * 7 (TSIN) and 9 (FFT function). FPSCR is put back to its entry value
 * before each FFT call and before returning.
 */
function ff_imdct_half_vfp, export=1
ldr ip, [CONTEXT, #5*4] @ mdct_bits
teq ip, #6
bne 10f
/* ---- Fixed-size path: mdct_bits == 6, everything unrolled ---- */
.set n, 1<<6
.set n2, n/2
.set n4, n/4
.set n8, n/8
push {v1-v5,lr}
vpush {s16-s27}
fmrx OLDFPSCR, FPSCR
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
mov OUT, ORIGOUT
ldr REVTAB, [CONTEXT, #2*4]
ldr TCOS, [CONTEXT, #6*4]
ldr TSIN, [CONTEXT, #7*4]
.set k, 0
.rept n8/2
prerotation_innerloop
.endr
/* Leave vector mode for the duration of the FFT call. */
fmxr FPSCR, OLDFPSCR
mov a1, OUT
bl X(ff_fft16_vfp)
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
/* Pipelined post-rotation: prologue (head only), steady state, epilogue
 * (tail only). */
.set k, 0
postrotation_innerloop , head
.rept n8/2 - 1
postrotation_innerloop tail, head
.endr
postrotation_innerloop tail
fmxr FPSCR, OLDFPSCR
vpop {s16-s27}
pop {v1-v5,pc}
/* ---- Generic path: any other mdct_bits, loops kept rolled ---- */
10:
push {v1-v6,sl,fp,lr}
vpush {s16-s27}
fmrx OLDFPSCR, FPSCR
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
mov lr, #1
mov OUT, ORIGOUT
ldr REVTAB, [CONTEXT, #2*4]
ldr TCOS, [CONTEXT, #6*4]
ldr TSIN, [CONTEXT, #7*4]
mov lr, lr, lsl ip
/* REVTAB_HI shares v5 with OLDFPSCR, so the saved FPSCR (and CONTEXT) are
 * parked on the stack before the loop overwrites the register. */
push {CONTEXT,OLDFPSCR}
add IN_HI, IN, lr, lsl #1
add REVTAB_HI, REVTAB, lr, lsr #1
add TCOS_HI, TCOS, lr
add TSIN_HI, TSIN, lr
0: prerotation_innerloop_rolled
teq IN, IN_HI
bne 0b
/* Reload without popping: the pair is needed again after the FFT call. */
ldmia sp, {CONTEXT,OLDFPSCR}
mov ORIGOUT, OUT
fmxr FPSCR, OLDFPSCR
ldr ip, [CONTEXT, #9*4]
blx ip @ s->fft_calc(s, output)
pop {CONTEXT,OLDFPSCR}
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
ldr ip, [CONTEXT, #5*4] @ mdct_bits
fmxr FPSCR, lr
mov lr, #1
mov lr, lr, lsl ip
/* Step TCOS/TSIN back by (1 << mdct_bits) / 2 bytes from where the
 * pre-rotation loop left them, then derive the downward-moving pointers. */
sub TCOS, TCOS, lr, lsr #1
sub TSIN, TSIN, lr, lsr #1
add OUT_HI, OUT, lr, lsl #1
add TCOS_HI, TCOS, lr
add TSIN_HI, TSIN, lr
/* Pipelined post-rotation; the loop body is two expansions so that the
 * TCOS register banks (s20-s23 / s24-s27) can alternate. */
postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
b 1f
0: add OUT, OUT, #32
sub OUT_HI, OUT_HI, #32
postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
teq TSIN, TSIN_HI
bne 0b
postrotation_innerloop_rolled tail,,,,,, s24,, 16
fmxr FPSCR, OLDFPSCR
vpop {s16-s27}
pop {v1-v6,sl,fp,pc}
endfunc
/* Drop every symbolic register name defined at the top of this file so the
 * aliases cannot leak into code assembled after it. */
.unreq CONTEXT
.unreq ORIGOUT
.unreq IN
.unreq OUT
.unreq REVTAB
.unreq TCOS
.unreq TSIN
.unreq OLDFPSCR
.unreq J0
.unreq J1
.unreq J2
.unreq J3
.unreq REVTAB_HI
.unreq IN_HI
.unreq OUT_HI
.unreq TCOS_HI
.unreq TSIN_HI

View File

@ -1,33 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/arm/cpu.h"
#include "libavcodec/rdft.h"
/* NEON implementation, provided by the assembly sources. */
void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);

/**
 * Select the ARM-specific RDFT routine for a context.
 *
 * The rdft_calc pointer is only replaced when the running CPU reports NEON
 * support; otherwise the context is left exactly as the caller set it up.
 *
 * @param s context whose rdft_calc pointer may be overridden
 */
av_cold void ff_rdft_init_arm(RDFTContext *s)
{
    const int flags = av_get_cpu_flags();

    if (!have_neon(flags))
        return;

    s->rdft_calc = ff_rdft_calc_neon;
}

View File

@ -1,155 +0,0 @@
/*
* ARM NEON optimised RDFT
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
/* void ff_rdft_calc_neon(RDFTContext *s, FFTSample *z)
 *
 * r0 = s, r1 = z. The field comments below name the context offsets that
 * are read (nbits, inverse, sign_convention, tcos, tsin, negative_sin);
 * s + 24 is passed as the context argument to the NEON FFT routines.
 * When bit 0 of inverse is clear, the complex FFT runs first and the
 * function returns right after the recombination pass. When it is set, the
 * recombination pass runs first, the first two output values are scaled by
 * 0.5, and the FFT is tail-called at the end.
 */
function ff_rdft_calc_neon, export=1
push {r4-r8,lr}
ldr r6, [r0, #4] @ inverse
mov r4, r0
mov r5, r1
/* r6 becomes inverse << 31: zero for a forward transform, otherwise a
 * sign-bit mask that is reused further down. */
lsls r6, r6, #31
bne 1f
add r0, r4, #24
bl X(ff_fft_permute_neon)
add r0, r4, #24
mov r1, r5
bl X(ff_fft_calc_neon)
1:
ldr r12, [r4, #0] @ nbits
mov r2, #1
ldr r8, [r4, #20] @ negative_sin
lsl r12, r2, r12
add r0, r5, #8
lsl r8, r8, #31
add r1, r5, r12, lsl #2
lsr r12, r12, #2
vdup.32 d26, r8
ldr r2, [r4, #12] @ tcos
sub r12, r12, #2
ldr r3, [r4, #16] @ tsin
/* r0/r7 walk upwards from z + 8 bytes (load/store), r1/lr walk downwards
 * from the end of the buffer in steps of r8 = -8 bytes. */
mov r7, r0
sub r1, r1, #8
mov lr, r1
mov r8, #-8
vld1.32 {d0}, [r0,:64]! @ d1[0,1]
vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
vld1.32 {d4}, [r2,:64]! @ tcos[i]
vld1.32 {d5}, [r3,:64]! @ tsin[i]
vmov.f32 d18, #0.5 @ k1
vdup.32 d19, r6
/* Flip the sign of tsin when negative_sin is set (d26 is a sign mask). */
veor d5, d26, d5
pld [r0, #32]
veor d19, d18, d19 @ k2
/* Build the sign-flip masks used by the veor instructions below. */
vmov.i32 d16, #0
vmov.i32 d17, #1<<31
pld [r1, #-32]
vtrn.32 d16, d17
pld [r2, #32]
vrev64.32 d16, d16 @ d16=1,0 d17=0,1
pld [r3, #32]
/* Main loop: two element pairs per iteration, with the loads for the next
 * iteration interleaved with the arithmetic of the current one. */
2:
veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
vld1.32 {d24}, [r0,:64]! @ d1[0,1]
vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
vld1.32 {d25}, [r1,:64], r8 @ d2[0,1]
vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1]
pld [r0, #32]
vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
pld [r1, #-32]
vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1]
vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1]
vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re
veor d7, d21, d16 @ -od.im, od.re
vrev64.32 d3, d21 @ od.re, od.im
veor d6, d20, d17 @ ev.re,-ev.im
veor d2, d3, d16 @ -od.re, od.im
vmla.f32 d20, d3, d4[1]
vmla.f32 d20, d7, d5[1]
vmla.f32 d6, d2, d4[1]
vmla.f32 d6, d21, d5[1]
vld1.32 {d4}, [r2,:64]! @ tcos[i]
veor d7, d23, d16 @ -od.im, od.re
vld1.32 {d5}, [r3,:64]! @ tsin[i]
veor d24, d22, d17 @ ev.re,-ev.im
vrev64.32 d3, d23 @ od.re, od.im
veor d5, d26, d5
pld [r2, #32]
veor d2, d3, d16 @ -od.re, od.im
pld [r3, #32]
vmla.f32 d22, d3, d4[0]
vmla.f32 d22, d7, d5[0]
vmla.f32 d24, d2, d4[0]
vmla.f32 d24, d23, d5[0]
vld1.32 {d0}, [r0,:64]! @ d1[0,1]
vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
vst1.32 {d20}, [r7,:64]!
vst1.32 {d6}, [lr,:64], r8
vst1.32 {d22}, [r7,:64]!
vst1.32 {d24}, [lr,:64], r8
subs r12, r12, #2
bgt 2b
/* Epilogue: process the last pair loaded by the loop, then fix up the
 * single value at r0 + 4 and the first two values of the buffer. */
veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
ldr r2, [r4, #8] @ sign_convention
vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
add r0, r0, #4
/* Keep only bit 31 of sign_convention, to be used as a sign-flip mask. */
bfc r2, #0, #31
vld1.32 {d0[0]}, [r0,:32]
veor d7, d21, d16 @ -od.im, od.re
vrev64.32 d3, d21 @ od.re, od.im
veor d6, d20, d17 @ ev.re,-ev.im
vld1.32 {d22}, [r5,:64]
vdup.32 d1, r2
vmov d23, d22
veor d2, d3, d16 @ -od.re, od.im
vtrn.32 d22, d23
veor d0, d0, d1
veor d23, d23, d17
vmla.f32 d20, d3, d4[1]
vmla.f32 d20, d7, d5[1]
vmla.f32 d6, d2, d4[1]
vmla.f32 d6, d21, d5[1]
vadd.f32 d22, d22, d23
vst1.32 {d20}, [r7,:64]
vst1.32 {d6}, [lr,:64]
vst1.32 {d0[0]}, [r0,:32]
vst1.32 {d22}, [r5,:64]
/* Forward transform: done. Otherwise scale the first two values by 0.5
 * (d18) and hand over to the FFT as a tail call. */
cmp r6, #0
it eq
popeq {r4-r8,pc}
vmul.f32 d22, d22, d18
vst1.32 {d22}, [r5,:64]
add r0, r4, #24
mov r1, r5
bl X(ff_fft_permute_neon)
add r0, r4, #24
mov r1, r5
pop {r4-r8,lr}
b X(ff_fft_calc_neon)
endfunc

View File

@ -23,7 +23,6 @@
#include "libavutil/arm/cpu.h"
#include "libavutil/attributes.h"
#include "libavutil/internal.h"
#include "libavcodec/fft.h"
#include "libavcodec/synth_filter.h"
void ff_synth_filter_float_vfp(AVTXContext *imdct,

View File

@ -1,80 +0,0 @@
/*
* Generate a header file for hardcoded ff_cos_* tables
*
* Copyright (c) 2009 Reimar Döffinger <Reimar.Doeffinger@gmx.de>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "libavutil/mathematics.h"
/* log2 of the largest table emitted; main() generates sizes 1 << 4 up to
 * 1 << BITS. */
#define BITS 17
/* printf() conversions used by printval() for float and fixed-point output */
#define FLOATFMT "%.18e"
#define FIXEDFMT "%6d"
/**
 * Saturate a value to the symmetric range [-32767, 32767].
 *
 * @param v value to clamp
 * @return v itself when already inside the range, otherwise the nearest bound
 */
static int clip_f15(int v)
{
    if (v < -32767)
        return -32767;
    if (v > 32767)
        return 32767;
    return v;
}
/**
 * Print one table entry followed by a comma.
 *
 * @param val   value to print
 * @param fixed when non-zero, val is scaled by 1 << 15, rounded half away
 *              from zero, clipped with clip_f15() and printed as an integer;
 *              otherwise val is printed as a floating-point literal
 */
static void printval(double val, int fixed)
{
    double scaled;

    if (!fixed) {
        printf(" "FLOATFMT",", val);
        return;
    }

    /* lrint() isn't always available, so round and cast manually. */
    scaled = val * (double) (1 << 15);
    if (scaled >= 0)
        scaled = floor(scaled + 0.5);
    else
        scaled = ceil(scaled - 0.5);

    printf(" "FIXEDFMT",", clip_f15((long int) scaled));
}
/**
 * Emit a C source fragment with the hardcoded trigonometric tables.
 *
 * The first command-line argument selects the output: exactly "sin" emits
 * SINTABLE() initialisers and includes rdft.h, anything else emits
 * COSTABLE() initialisers and includes fft.h; an argument containing
 * "fixed" switches the values to fixed-point. One table is written for
 * every size from 1 << 4 up to 1 << BITS.
 */
int main(int argc, char *argv[])
{
    const int want_sin   = argc > 1 && !strcmp(argv[1], "sin");
    const int want_fixed = argc > 1 && strstr(argv[1], "fixed");
    double (*const trig)(double) = want_sin ? sin : cos;
    int bits;

    printf("/* This file was automatically generated. */\n");
    printf("#define FFT_FLOAT %d\n", !want_fixed);
    printf("#include \"libavcodec/%s\"\n", want_sin ? "rdft.h" : "fft.h");

    for (bits = 4; bits <= BITS; bits++) {
        const int    size = 1 << bits;
        const double step = 2*M_PI/size;
        int pos;

        printf("%s(%i) = {\n ", want_sin ? "SINTABLE" : "COSTABLE", size);

        /* All entries but the last one; four values per output line. */
        for (pos = 0; pos < size/2 - 1; pos++) {
            int idx;

            if (want_sin && pos >= size/4)
                idx = size/4 - pos;
            else if (pos > size/4)
                idx = size/2 - pos;
            else
                idx = pos;

            printval(trig(idx*step), want_fixed);
            if ((pos & 3) == 3)
                printf("\n ");
        }

        /* The final entry is computed separately and closes the table. */
        printval(trig(want_sin ? -(size/4 - 1)*step : step), want_fixed);
        printf("\n};\n");
    }

    return 0;
}

View File

@ -1,228 +0,0 @@
/*
* (I)DCT Transforms
* Copyright (c) 2009 Peter Ross <pross@xvid.org>
* Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
* Copyright (c) 2010 Vitor Sessak
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* (Inverse) Discrete Cosine Transforms. These are also known as the
* type II and type III DCTs respectively.
*/
#include <math.h>
#include <string.h>
#include "libavutil/error.h"
#include "libavutil/mathematics.h"
#include "libavutil/mem.h"
#include "dct.h"
#include "dct32.h"
/* sin(M_PI * x / (2 * n)), read from the cosine table at index n - x */
#define SIN(s, n, x) (s->costab[(n) - (x)])
/* cos(M_PI * x / (2 * n)), read from the cosine table at index x */
#define COS(s, n, x) (s->costab[x])
/**
 * In-place DST-I of (1 << ctx->nbits) samples, built on the context's RDFT.
 *
 * data[0] is overwritten with 0 before it is read, so its input value is
 * ignored. The buffer is pre-weighted, passed to ctx->rdft.rdft_calc(), and
 * then unpacked in place.
 *
 * @param ctx  initialised DCT context
 * @param data samples to transform, overwritten with the result
 */
static void dst_calc_I_c(DCTContext *ctx, FFTSample *data)
{
    const int len = 1 << ctx->nbits;

    /* Pre-processing: combine each sample with its mirror image. */
    data[0] = 0;
    for (int k = 1; k < len / 2; k++) {
        const float lo       = data[k];
        const float hi       = data[len - k];
        const float weighted = SIN(ctx, len, 2 * k) * (lo + hi);
        const float half_gap = (lo - hi) * 0.5f;

        data[k]       = weighted + half_gap;
        data[len - k] = weighted - half_gap;
    }
    data[len / 2] *= 2;

    ctx->rdft.rdft_calc(&ctx->rdft, data);

    /* Post-processing: running sum into the odd-index successors, negated
     * copy into the odd slots. */
    data[0] *= 0.5f;
    for (int k = 1; k < len - 2; k += 2) {
        data[k + 1] += data[k - 1];
        data[k]      = -data[k + 2];
    }
    data[len - 1] = 0;
}
/**
 * In-place DCT-I, built on the context's RDFT.
 *
 * The buffer holds (1 << ctx->nbits) + 1 samples: data[1 << ctx->nbits] is
 * both read and written.
 *
 * @param ctx  initialised DCT context
 * @param data samples to transform, overwritten with the result
 */
static void dct_calc_I_c(DCTContext *ctx, FFTSample *data)
{
    const int len = 1 << ctx->nbits;
    /* Accumulates the value that ends up in data[1] after the RDFT. */
    float acc = -0.5f * (data[0] - data[len]);

    for (int k = 0; k < len / 2; k++) {
        const float lo   = data[k];
        const float hi   = data[len - k];
        const float sn   = SIN(ctx, len, 2 * k) * (lo - hi);
        const float cs   = COS(ctx, len, 2 * k) * (lo - hi);
        const float mean = (lo + hi) * 0.5f;

        acc          += cs;
        data[k]       = mean - sn;
        data[len - k] = mean + sn;
    }

    ctx->rdft.rdft_calc(&ctx->rdft, data);

    data[len] = data[1];
    data[1]   = acc;
    /* Each odd entry becomes the previous odd entry minus itself. */
    for (int k = 3; k <= len; k += 2)
        data[k] = data[k - 2] - data[k];
}
/**
 * In-place DCT-III of (1 << ctx->nbits) samples, built on the context's
 * RDFT.
 *
 * The input is rotated pairwise from the top of the buffer downwards, run
 * through ctx->rdft.rdft_calc(), then scaled by 1 / n and combined with the
 * ctx->csc2 table.
 *
 * @param ctx  initialised DCT context (csc2 must be populated)
 * @param data samples to transform, overwritten with the result
 */
static void dct_calc_III_c(DCTContext *ctx, FFTSample *data)
{
    const int   len   = 1 << ctx->nbits;
    const float last  = data[len - 1];
    const float scale = 1.0f / len;

    /* Walk downwards so data[k + 1] is still the original value when read. */
    for (int k = len - 2; k >= 2; k -= 2) {
        const float cur   = data[k];
        const float delta = data[k - 1] - data[k + 1];
        const float cs    = COS(ctx, len, k);
        const float sn    = SIN(ctx, len, k);

        data[k]     = cs * cur + sn * delta;
        data[k + 1] = sn * cur - cs * delta;
    }
    data[1] = 2 * last;

    ctx->rdft.rdft_calc(&ctx->rdft, data);

    for (int k = 0; k < len / 2; k++) {
        const float head = data[k] * scale;
        const float tail = data[len - k - 1] * scale;
        const float corr = ctx->csc2[k] * (head - tail);
        const float sum  = head + tail;

        data[k]           = sum + corr;
        data[len - k - 1] = sum - corr;
    }
}
/**
 * In-place DCT-II of (1 << ctx->nbits) samples, built on the context's RDFT.
 *
 * Mirrored sample pairs are pre-weighted, the buffer is passed to
 * ctx->rdft.rdft_calc(), and the result is rotated pairwise from the top of
 * the buffer downwards while a running value is carried into the odd slots.
 *
 * @param ctx  initialised DCT context
 * @param data samples to transform, overwritten with the result
 */
static void dct_calc_II_c(DCTContext *ctx, FFTSample *data)
{
    const int len = 1 << ctx->nbits;
    float carry;

    for (int k = 0; k < len / 2; k++) {
        const float lo       = data[k];
        const float hi       = data[len - k - 1];
        const float weighted = SIN(ctx, len, 2 * k + 1) * (lo - hi);
        const float mean     = (lo + hi) * 0.5f;

        data[k]           = mean + weighted;
        data[len - k - 1] = mean - weighted;
    }

    ctx->rdft.rdft_calc(&ctx->rdft, data);

    carry    = data[1] * 0.5;
    data[1] *= -1;

    for (int k = len - 2; k >= 0; k -= 2) {
        const float even = data[k];
        const float odd  = data[k + 1];
        const float cs   = COS(ctx, len, k);
        const float sn   = SIN(ctx, len, k);

        data[k]     = cs * even + sn * odd;
        data[k + 1] = carry;
        carry      += sn * even - cs * odd;
    }
}
/* dct_calc adapter for the dedicated 32-point routine: calls ctx->dct32
 * with the same buffer as both output and input. */
static void dct32_func(DCTContext *ctx, FFTSample *data)
{
ctx->dct32(data, data);
}
/**
 * Set up a DCT/DST context.
 *
 * The context is zeroed first. A 32-point DCT-II (nbits == 5) is routed to
 * the dedicated dct32 routine and needs no tables; every other combination
 * gets a cosine table, a csc2 table of n / 2 entries and an RDFT context.
 *
 * @param s       context to initialise
 * @param nbits   log2 of the transform size
 * @param inverse transform type (DCT_I, DCT_II, DCT_III or DST_I)
 * @return 0 on success, AVERROR(ENOMEM) if the csc2 table cannot be
 *         allocated, or the negative error code of ff_rdft_init()
 */
av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
{
    const int len = 1 << nbits;

    memset(s, 0, sizeof(*s));
    s->nbits   = nbits;
    s->inverse = inverse;

    if (inverse != DCT_II || nbits != 5) {
        int err;

        ff_init_ff_cos_tabs(nbits + 2);
        s->costab = ff_cos_tabs[nbits + 2];

        s->csc2 = av_malloc_array(len / 2, sizeof(FFTSample));
        if (!s->csc2)
            return AVERROR(ENOMEM);

        err = ff_rdft_init(&s->rdft, nbits, inverse == DCT_III);
        if (err < 0) {
            /* Do not leave a half-initialised context behind. */
            av_freep(&s->csc2);
            return err;
        }

        for (int k = 0; k < len / 2; k++)
            s->csc2[k] = 0.5 / sin((M_PI / (2 * len) * (2 * k + 1)));

        if (inverse == DCT_I)
            s->dct_calc = dct_calc_I_c;
        else if (inverse == DCT_II)
            s->dct_calc = dct_calc_II_c;
        else if (inverse == DCT_III)
            s->dct_calc = dct_calc_III_c;
        else if (inverse == DST_I)
            s->dct_calc = dst_calc_I_c;
    } else {
        s->dct_calc = dct32_func;
    }

    s->dct32 = ff_dct32_float;
#if ARCH_X86
    ff_dct_init_x86(s);
#endif

    return 0;
}
/* Release what ff_dct_init() set up: the embedded RDFT context and the
 * csc2 table (the pointer is reset by av_freep()). */
av_cold void ff_dct_end(DCTContext *s)
{
ff_rdft_end(&s->rdft);
av_freep(&s->csc2);
}

View File

@ -21,37 +21,12 @@
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#if !defined(AVCODEC_DCT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
#ifndef AVCODEC_DCT_H
#define AVCODEC_DCT_H
#include <stddef.h>
#include <stdint.h>
#include "rdft.h"
struct DCTContext {
int nbits;
int inverse;
RDFTContext rdft;
const float *costab;
FFTSample *csc2;
void (*dct_calc)(struct DCTContext *s, FFTSample *data);
void (*dct32)(FFTSample *out, const FFTSample *in);
};
/**
* Set up DCT.
* @param nbits size of the input array:
* (1 << nbits) for DCT-II, DCT-III and DST-I
* (1 << nbits) + 1 for DCT-I
*
* @note the first element of the input of DST-I is ignored
*/
int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType type);
void ff_dct_end (DCTContext *s);
void ff_dct_init_x86(DCTContext *s);
void ff_j_rev_dct(int16_t *data);
void ff_j_rev_dct4(int16_t *data);
void ff_j_rev_dct2(int16_t *data);

View File

@ -1,62 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_FFT_INTERNAL_H
#define AVCODEC_FFT_INTERNAL_H
#include "libavutil/mathematics.h"
#include "fft.h"
/* Arithmetic helpers, selected by build flavour. FIX15, sqrthalf and BF are
 * only defined in the float branch; CMUL exists in both. */
#if FFT_FLOAT
/* Identity in the float build: the value is used as is. */
#define FIX15(v) (v)
/* M_SQRT1_2 converted to float. */
#define sqrthalf (float)M_SQRT1_2
/* x receives a - b and y receives a + b. */
#define BF(x, y, a, b) do { \
x = a - b; \
y = a + b; \
} while (0)
/* Complex multiply: (dre + i*dim) = (are + i*aim) * (bre + i*bim). */
#define CMUL(dre, dim, are, aim, bre, bim) do { \
(dre) = (are) * (bre) - (aim) * (bim); \
(dim) = (are) * (bim) + (aim) * (bre); \
} while (0)
#else /* FFT_FLOAT */
/* Same complex multiply on integers: the products are accumulated in 64
 * bits, 0x40000000 is added, and the sum is shifted right by 31 bits. */
#define CMUL(dre, dim, are, aim, bre, bim) do { \
int64_t accu; \
(accu) = (int64_t)(bre) * (are); \
(accu) -= (int64_t)(bim) * (aim); \
(dre) = (int)(((accu) + 0x40000000) >> 31); \
(accu) = (int64_t)(bre) * (aim); \
(accu) += (int64_t)(bim) * (are); \
(dim) = (int)(((accu) + 0x40000000) >> 31); \
} while (0)
#endif /* FFT_FLOAT */
/* Route the names through FFT_NAME() so that each build flavour gets its
 * own set of symbols. */
#define ff_imdct_calc_c FFT_NAME(ff_imdct_calc_c)
#define ff_imdct_half_c FFT_NAME(ff_imdct_half_c)
#define ff_mdct_calc_c FFT_NAME(ff_mdct_calc_c)
/* NOTE(review): presumably the portable C implementations behind the
 * imdct_calc / imdct_half / mdct_calc pointers of FFTContext; confirm
 * against the MDCT template. */
void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
#endif /* AVCODEC_FFT_INTERNAL_H */

View File

@ -1,160 +0,0 @@
/*
* Copyright (c) 2000, 2001, 2002 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_FFT_H
#define AVCODEC_FFT_H
#ifndef FFT_FLOAT
#define FFT_FLOAT 1
#endif
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes_internal.h"
#include "libavutil/mem_internal.h"
/* Select the sample types and the symbol-name decoration for this build.
 * The float flavour leaves names untouched and (presumably) takes
 * FFTSample/FFTComplex/FFTContext from avfft.h; the fixed-point flavour
 * declares int32_t based types here and appends _fixed_32 to every name
 * wrapped in FFT_NAME(). */
#if FFT_FLOAT
#include "avfft.h"
#define FFT_NAME(x) x
typedef float FFTDouble;
#else
/* Scale x by 2^31 and add 0.5 before truncating to int. */
#define Q31(x) (int)((x)*2147483648.0 + 0.5)
#define FFT_NAME(x) x ## _fixed_32
typedef int32_t FFTSample;
typedef struct FFTComplex {
FFTSample re, im;
} FFTComplex;
typedef int FFTDouble;
typedef struct FFTContext FFTContext;
#endif /* FFT_FLOAT */
/* Complex value whose components use FFTDouble (float or int, see above). */
typedef struct FFTDComplex {
FFTDouble re, im;
} FFTDComplex;
/* FFT computation */
/* Values stored in FFTContext.fft_permutation.
 * NOTE(review): the exact data layout each value stands for is defined by
 * the FFT implementations, which are not visible here. */
enum fft_permutation_type {
FF_FFT_PERM_DEFAULT,
FF_FFT_PERM_SWAP_LSBS,
FF_FFT_PERM_AVX,
};
/* Values stored in FFTContext.mdct_permutation. */
enum mdct_permutation_type {
FF_MDCT_PERM_NONE,
FF_MDCT_PERM_INTERLEAVE,
};
/* State shared by the FFT and MDCT code.
 * The member order is part of an assembly contract: the ARM VFP IMDCT reads
 * revtab, mdct_bits, tcos, tsin and fft_calc at fixed word offsets (2, 5,
 * 6, 7 and 9), so members must not be reordered or inserted before them. */
struct FFTContext {
int nbits;
int inverse;
uint16_t *revtab;
FFTComplex *tmp_buf;
int mdct_size; /* size of MDCT (i.e. number of input data * 2) */
int mdct_bits; /* n = 2^nbits */
/* pre/post rotation tables */
FFTSample *tcos;
FFTSample *tsin;
/**
* Do the permutation needed BEFORE calling fft_calc().
*/
void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
/**
* Do a complex FFT with the parameters defined in ff_fft_init(). The
* input data must be permuted before. No 1.0/sqrt(n) normalization is done.
*/
void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
/* MDCT entry points; all take separate output and input buffers. */
void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
enum fft_permutation_type fft_permutation;
enum mdct_permutation_type mdct_permutation;
/* NOTE(review): presumably a 32-bit counterpart of revtab; confirm. */
uint32_t *revtab32;
};
/* With hardcoded tables the cosine tables are const data and
 * ff_init_ff_cos_tabs() expands to nothing; otherwise they are writable and
 * filled in at run time by ff_init_ff_cos_tabs(). */
#if CONFIG_HARDCODED_TABLES
#define COSTABLE_CONST const
#define ff_init_ff_cos_tabs(index)
#else
#define COSTABLE_CONST
#define ff_init_ff_cos_tabs FFT_NAME(ff_init_ff_cos_tabs)
/**
* Initialize the cosine table in ff_cos_tabs[index]
* @param index index in ff_cos_tabs array of the table to initialize
*/
void ff_init_ff_cos_tabs(int index);
#endif
/* Declarator for the table ff_cos_<size> (name decorated by FFT_NAME):
 * size/2 FFTSample entries, 32-byte aligned, hidden visibility. */
#define COSTABLE(size) \
COSTABLE_CONST attribute_visibility_hidden DECLARE_ALIGNED(32, FFTSample, FFT_NAME(ff_cos_##size))[size/2]
extern COSTABLE(16);
extern COSTABLE(32);
extern COSTABLE(64);
extern COSTABLE(128);
extern COSTABLE(256);
extern COSTABLE(512);
extern COSTABLE(1024);
extern COSTABLE(2048);
extern COSTABLE(4096);
extern COSTABLE(8192);
extern COSTABLE(16384);
extern COSTABLE(32768);
extern COSTABLE(65536);
extern COSTABLE(131072);
/* Lookup array over the tables above, indexed the same way as
 * ff_init_ff_cos_tabs(). */
extern COSTABLE_CONST FFTSample* const FFT_NAME(ff_cos_tabs)[18];
/* Flavour-specific names for the FFT set-up and tear-down entry points. */
#define ff_fft_init FFT_NAME(ff_fft_init)
#define ff_fft_end FFT_NAME(ff_fft_end)
/**
* Set up a complex FFT.
* @param nbits log2 of the length of the input array
* @param inverse if 0 perform the forward transform, if 1 perform the inverse
*/
int ff_fft_init(FFTContext *s, int nbits, int inverse);
/* Per-architecture hooks; each receives the context being set up. */
void ff_fft_init_aarch64(FFTContext *s);
void ff_fft_init_x86(FFTContext *s);
void ff_fft_init_arm(FFTContext *s);
void ff_fft_init_mips(FFTContext *s);
void ff_fft_init_ppc(FFTContext *s);
/* Counterpart of ff_fft_init(). */
void ff_fft_end(FFTContext *s);
/* Flavour-specific names for the MDCT set-up and tear-down entry points. */
#define ff_mdct_init FFT_NAME(ff_mdct_init)
#define ff_mdct_end FFT_NAME(ff_mdct_end)
/* NOTE(review): the meaning of nbits and scale is defined by the MDCT
 * template, which is not visible here; confirm before relying on it. */
int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale);
void ff_mdct_end(FFTContext *s);
#endif /* AVCODEC_FFT_H */

View File

@ -1,51 +0,0 @@
/*
* Copyright (c) 2012
* MIPS Technologies, Inc., California.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Authors: Stanislav Ocovaj (socovaj@mips.com)
* Goran Cordasic (goran@mips.com)
* Djordje Pesut (djordje@mips.com)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Instantiate the FFT template with FFT_FLOAT set to 0 (fixed-point
 * flavour). */
#define FFT_FLOAT 0
#include "fft_template.c"

View File

@ -1,20 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Instantiate the FFT template with FFT_FLOAT set to 1 (float flavour). */
#define FFT_FLOAT 1
#include "fft_template.c"

View File

@ -1,344 +0,0 @@
/*
* Copyright (c) 2012
* MIPS Technologies, Inc., California.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Authors: Stanislav Ocovaj (socovaj@mips.com)
* Goran Cordasic (goran@mips.com)
* Djordje Pesut (djordje@mips.com)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* definitions and initialization of LUT table for FFT
*/
#include "libavutil/thread.h"
#include "libavcodec/fft_table.h"
const int32_t ff_w_tab_sr[MAX_FFT_SIZE/(4*16)] = {
2147483647, 2147483016, 2147481121, 2147477963, 2147473542, 2147467857, 2147460908, 2147452697,
2147443222, 2147432484, 2147420483, 2147407218, 2147392690, 2147376899, 2147359845, 2147341527,
2147321946, 2147301102, 2147278995, 2147255625, 2147230991, 2147205094, 2147177934, 2147149511,
2147119825, 2147088876, 2147056664, 2147023188, 2146988450, 2146952448, 2146915184, 2146876656,
2146836866, 2146795813, 2146753497, 2146709917, 2146665076, 2146618971, 2146571603, 2146522973,
2146473080, 2146421924, 2146369505, 2146315824, 2146260881, 2146204674, 2146147205, 2146088474,
2146028480, 2145967224, 2145904705, 2145840924, 2145775880, 2145709574, 2145642006, 2145573176,
2145503083, 2145431729, 2145359112, 2145285233, 2145210092, 2145133690, 2145056025, 2144977098,
2144896910, 2144815460, 2144732748, 2144648774, 2144563539, 2144477042, 2144389283, 2144300264,
2144209982, 2144118439, 2144025635, 2143931570, 2143836244, 2143739656, 2143641807, 2143542697,
2143442326, 2143340694, 2143237802, 2143133648, 2143028234, 2142921559, 2142813624, 2142704427,
2142593971, 2142482254, 2142369276, 2142255039, 2142139541, 2142022783, 2141904764, 2141785486,
2141664948, 2141543150, 2141420092, 2141295774, 2141170197, 2141043360, 2140915264, 2140785908,
2140655293, 2140523418, 2140390284, 2140255892, 2140120240, 2139983329, 2139845159, 2139705730,
2139565043, 2139423097, 2139279892, 2139135429, 2138989708, 2138842728, 2138694490, 2138544994,
2138394240, 2138242228, 2138088958, 2137934430, 2137778644, 2137621601, 2137463301, 2137303743,
2137142927, 2136980855, 2136817525, 2136652938, 2136487095, 2136319994, 2136151637, 2135982023,
2135811153, 2135639026, 2135465642, 2135291003, 2135115107, 2134937956, 2134759548, 2134579885,
2134398966, 2134216791, 2134033361, 2133848675, 2133662734, 2133475538, 2133287087, 2133097381,
2132906420, 2132714204, 2132520734, 2132326009, 2132130030, 2131932796, 2131734309, 2131534567,
2131333572, 2131131322, 2130927819, 2130723062, 2130517052, 2130309789, 2130101272, 2129891502,
2129680480, 2129468204, 2129254676, 2129039895, 2128823862, 2128606576, 2128388038, 2128168248,
2127947206, 2127724913, 2127501367, 2127276570, 2127050522, 2126823222, 2126594672, 2126364870,
2126133817, 2125901514, 2125667960, 2125433155, 2125197100, 2124959795, 2124721240, 2124481435,
2124240380, 2123998076, 2123754522, 2123509718, 2123263666, 2123016364, 2122767814, 2122518015,
2122266967, 2122014670, 2121761126, 2121506333, 2121250292, 2120993003, 2120734467, 2120474683,
2120213651, 2119951372, 2119687847, 2119423074, 2119157054, 2118889788, 2118621275, 2118351516,
2118080511, 2117808259, 2117534762, 2117260020, 2116984031, 2116706797, 2116428319, 2116148595,
2115867626, 2115585412, 2115301954, 2115017252, 2114731305, 2114444114, 2114155680, 2113866001,
2113575080, 2113282914, 2112989506, 2112694855, 2112398960, 2112101824, 2111803444, 2111503822,
2111202959, 2110900853, 2110597505, 2110292916, 2109987085, 2109680013, 2109371700, 2109062146,
2108751352, 2108439317, 2108126041, 2107811526, 2107495770, 2107178775, 2106860540, 2106541065,
2106220352, 2105898399, 2105575208, 2105250778, 2104925109, 2104598202, 2104270057, 2103940674,
2103610054, 2103278196, 2102945101, 2102610768, 2102275199, 2101938393, 2101600350, 2101261071,
2100920556, 2100578805, 2100235819, 2099891596, 2099546139, 2099199446, 2098851519, 2098502357,
2098151960, 2097800329, 2097447464, 2097093365, 2096738032, 2096381466, 2096023667, 2095664635,
2095304370, 2094942872, 2094580142, 2094216179, 2093850985, 2093484559, 2093116901, 2092748012,
2092377892, 2092006541, 2091633960, 2091260147, 2090885105, 2090508833, 2090131331, 2089752599,
2089372638, 2088991448, 2088609029, 2088225381, 2087840505, 2087454400, 2087067068, 2086678508,
2086288720, 2085897705, 2085505463, 2085111994, 2084717298, 2084321376, 2083924228, 2083525854,
2083126254, 2082725429, 2082323379, 2081920103, 2081515603, 2081109879, 2080702930, 2080294757,
2079885360, 2079474740, 2079062896, 2078649830, 2078235540, 2077820028, 2077403294, 2076985338,
2076566160, 2076145760, 2075724139, 2075301296, 2074877233, 2074451950, 2074025446, 2073597721,
2073168777, 2072738614, 2072307231, 2071874629, 2071440808, 2071005769, 2070569511, 2070132035,
2069693342, 2069253430, 2068812302, 2068369957, 2067926394, 2067481616, 2067035621, 2066588410,
2066139983, 2065690341, 2065239484, 2064787411, 2064334124, 2063879623, 2063423908, 2062966978,
2062508835, 2062049479, 2061588910, 2061127128, 2060664133, 2060199927, 2059734508, 2059267877,
2058800036, 2058330983, 2057860719, 2057389244, 2056916560, 2056442665, 2055967560, 2055491246,
2055013723, 2054534991, 2054055050, 2053573901, 2053091544, 2052607979, 2052123207, 2051637227,
2051150040, 2050661647, 2050172048, 2049681242, 2049189231, 2048696014, 2048201592, 2047705965,
2047209133, 2046711097, 2046211857, 2045711414, 2045209767, 2044706916, 2044202863, 2043697608,
2043191150, 2042683490, 2042174628, 2041664565, 2041153301, 2040640837, 2040127172, 2039612306,
2039096241, 2038578976, 2038060512, 2037540850, 2037019988, 2036497928, 2035974670, 2035450215,
2034924562, 2034397712, 2033869665, 2033340422, 2032809982, 2032278347, 2031745516, 2031211490,
2030676269, 2030139853, 2029602243, 2029063439, 2028523442, 2027982251, 2027439867, 2026896291,
2026351522, 2025805561, 2025258408, 2024710064, 2024160529, 2023609803, 2023057887, 2022504780,
2021950484, 2021394998, 2020838323, 2020280460, 2019721407, 2019161167, 2018599739, 2018037123,
2017473321, 2016908331, 2016342155, 2015774793, 2015206245, 2014636511, 2014065592, 2013493489,
2012920201, 2012345729, 2011770073, 2011193233, 2010615210, 2010036005, 2009455617, 2008874047,
2008291295, 2007707362, 2007122248, 2006535953, 2005948478, 2005359822, 2004769987, 2004178973,
2003586779, 2002993407, 2002398857, 2001803128, 2001206222, 2000608139, 2000008879, 1999408442,
1998806829, 1998204040, 1997600076, 1996994937, 1996388622, 1995781134, 1995172471, 1994562635,
1993951625, 1993339442, 1992726087, 1992111559, 1991495860, 1990878989, 1990260946, 1989641733,
1989021350, 1988399796, 1987777073, 1987153180, 1986528118, 1985901888, 1985274489, 1984645923,
1984016189, 1983385288, 1982753220, 1982119985, 1981485585, 1980850019, 1980213288, 1979575392,
1978936331, 1978296106, 1977654717, 1977012165, 1976368450, 1975723572, 1975077532, 1974430331,
1973781967, 1973132443, 1972481757, 1971829912, 1971176906, 1970522741, 1969867417, 1969210933,
1968553292, 1967894492, 1967234535, 1966573420, 1965911148, 1965247720, 1964583136, 1963917396,
1963250501, 1962582451, 1961913246, 1961242888, 1960571375, 1959898709, 1959224890, 1958549919,
1957873796, 1957196520, 1956518093, 1955838516, 1955157788, 1954475909, 1953792881, 1953108703,
1952423377, 1951736902, 1951049279, 1950360508, 1949670589, 1948979524, 1948287312, 1947593954,
1946899451, 1946203802, 1945507008, 1944809070, 1944109987, 1943409761, 1942708392, 1942005880,
1941302225, 1940597428, 1939891490, 1939184411, 1938476190, 1937766830, 1937056329, 1936344689,
1935631910, 1934917992, 1934202936, 1933486742, 1932769411, 1932050943, 1931331338, 1930610597,
1929888720, 1929165708, 1928441561, 1927716279, 1926989864, 1926262315, 1925533633, 1924803818,
1924072871, 1923340791, 1922607581, 1921873239, 1921137767, 1920401165, 1919663432, 1918924571,
1918184581, 1917443462, 1916701216, 1915957841, 1915213340, 1914467712, 1913720958, 1912973078,
1912224073, 1911473942, 1910722688, 1909970309, 1909216806, 1908462181, 1907706433, 1906949562,
1906191570, 1905432457, 1904672222, 1903910867, 1903148392, 1902384797, 1901620084, 1900854251,
1900087301, 1899319232, 1898550047, 1897779744, 1897008325, 1896235790, 1895462140, 1894687374,
1893911494, 1893134500, 1892356392, 1891577171, 1890796837, 1890015391, 1889232832, 1888449163,
1887664383, 1886878492, 1886091491, 1885303381, 1884514161, 1883723833, 1882932397, 1882139853,
1881346202, 1880551444, 1879755580, 1878958610, 1878160535, 1877361354, 1876561070, 1875759681,
1874957189, 1874153594, 1873348897, 1872543097, 1871736196, 1870928194, 1870119091, 1869308888,
1868497586, 1867685184, 1866871683, 1866057085, 1865241388, 1864424594, 1863606704, 1862787717,
1861967634, 1861146456, 1860324183, 1859500816, 1858676355, 1857850800, 1857024153, 1856196413,
1855367581, 1854537657, 1853706643, 1852874538, 1852041343, 1851207059, 1850371686, 1849535224,
1848697674, 1847859036, 1847019312, 1846178501, 1845336604, 1844493621, 1843649553, 1842804401,
1841958164, 1841110844, 1840262441, 1839412956, 1838562388, 1837710739, 1836858008, 1836004197,
1835149306, 1834293336, 1833436286, 1832578158, 1831718951, 1830858668, 1829997307, 1829134869,
1828271356, 1827406767, 1826541103, 1825674364, 1824806552, 1823937666, 1823067707, 1822196675,
1821324572, 1820451397, 1819577151, 1818701835, 1817825449, 1816947994, 1816069469, 1815189877,
1814309216, 1813427489, 1812544694, 1811660833, 1810775906, 1809889915, 1809002858, 1808114737,
1807225553, 1806335305, 1805443995, 1804551623, 1803658189, 1802763694, 1801868139, 1800971523,
1800073849, 1799175115, 1798275323, 1797374472, 1796472565, 1795569601, 1794665580, 1793760504,
1792854372, 1791947186, 1791038946, 1790129652, 1789219305, 1788307905, 1787395453, 1786481950,
1785567396, 1784651792, 1783735137, 1782817434, 1781898681, 1780978881, 1780058032, 1779136137,
1778213194, 1777289206, 1776364172, 1775438094, 1774510970, 1773582803, 1772653593, 1771723340,
1770792044, 1769859707, 1768926328, 1767991909, 1767056450, 1766119952, 1765182414, 1764243838,
1763304224, 1762363573, 1761421885, 1760479161, 1759535401, 1758590607, 1757644777, 1756697914,
1755750017, 1754801087, 1753851126, 1752900132, 1751948107, 1750995052, 1750040966, 1749085851,
1748129707, 1747172535, 1746214334, 1745255107, 1744294853, 1743333573, 1742371267, 1741407936,
1740443581, 1739478202, 1738511799, 1737544374, 1736575927, 1735606458, 1734635968, 1733664458,
1732691928, 1731718378, 1730743810, 1729768224, 1728791620, 1727813999, 1726835361, 1725855708,
1724875040, 1723893357, 1722910659, 1721926948, 1720942225, 1719956488, 1718969740, 1717981981,
1716993211, 1716003431, 1715012642, 1714020844, 1713028037, 1712034223, 1711039401, 1710043573,
1709046739, 1708048900, 1707050055, 1706050207, 1705049355, 1704047500, 1703044642, 1702040783,
1701035922, 1700030061, 1699023199, 1698015339, 1697006479, 1695996621, 1694985765, 1693973912,
1692961062, 1691947217, 1690932376, 1689916541, 1688899711, 1687881888, 1686863072, 1685843263,
1684822463, 1683800672, 1682777890, 1681754118, 1680729357, 1679703608, 1678676870, 1677649144,
1676620432, 1675590733, 1674560049, 1673528379, 1672495725, 1671462087, 1670427466, 1669391862,
1668355276, 1667317709, 1666279161, 1665239632, 1664199124, 1663157637, 1662115172, 1661071729,
1660027308, 1658981911, 1657935539, 1656888190, 1655839867, 1654790570, 1653740300, 1652689057,
1651636841, 1650583654, 1649529496, 1648474367, 1647418269, 1646361202, 1645303166, 1644244162,
1643184191, 1642123253, 1641061349, 1639998480, 1638934646, 1637869848, 1636804087, 1635737362,
1634669676, 1633601027, 1632531418, 1631460848, 1630389319, 1629316830, 1628243383, 1627168978,
1626093616, 1625017297, 1623940023, 1622861793, 1621782608, 1620702469, 1619621377, 1618539332,
1617456335, 1616372386, 1615287487, 1614201637, 1613114838, 1612027089, 1610938393, 1609848749,
1608758157, 1607666620, 1606574136, 1605480708, 1604386335, 1603291018, 1602194758, 1601097555,
1599999411, 1598900325, 1597800299, 1596699333, 1595597428, 1594494583, 1593390801, 1592286082,
1591180426, 1590073833, 1588966306, 1587857843, 1586748447, 1585638117, 1584526854, 1583414660,
1582301533, 1581187476, 1580072489, 1578956572, 1577839726, 1576721952, 1575603251, 1574483623,
1573363068, 1572241588, 1571119183, 1569995854, 1568871601, 1567746425, 1566620327, 1565493307,
1564365367, 1563236506, 1562106725, 1560976026, 1559844408, 1558711873, 1557578421, 1556444052,
1555308768, 1554172569, 1553035455, 1551897428, 1550758488, 1549618636, 1548477872, 1547336197,
1546193612, 1545050118, 1543905714, 1542760402, 1541614183, 1540467057, 1539319024, 1538170087,
1537020244, 1535869497, 1534717846, 1533565293, 1532411837, 1531257480, 1530102222, 1528946064,
1527789007, 1526631051, 1525472197, 1524312445, 1523151797, 1521990252, 1520827813, 1519664478,
1518500250, 1517335128, 1516169114, 1515002208, 1513834411, 1512665723, 1511496145, 1510325678,
1509154322, 1507982079, 1506808949, 1505634932, 1504460029, 1503284242, 1502107570, 1500930014,
1499751576, 1498572255, 1497392053, 1496210969, 1495029006, 1493846163, 1492662441, 1491477842,
1490292364, 1489106011, 1487918781, 1486730675, 1485541696, 1484351842, 1483161115, 1481969516,
1480777044, 1479583702, 1478389489, 1477194407, 1475998456, 1474801636, 1473603949, 1472405394,
1471205974, 1470005688, 1468804538, 1467602523, 1466399645, 1465195904, 1463991302, 1462785838,
1461579514, 1460372329, 1459164286, 1457955385, 1456745625, 1455535009, 1454323536, 1453111208,
1451898025, 1450683988, 1449469098, 1448253355, 1447036760, 1445819314, 1444601017, 1443381870,
1442161874, 1440941030, 1439719338, 1438496799, 1437273414, 1436049184, 1434824109, 1433598189,
1432371426, 1431143821, 1429915374, 1428686085, 1427455956, 1426224988, 1424993180, 1423760534,
1422527051, 1421292730, 1420057574, 1418821582, 1417584755, 1416347095, 1415108601, 1413869275,
1412629117, 1411388129, 1410146309, 1408903661, 1407660183, 1406415878, 1405170745, 1403924785,
1402678000, 1401430389, 1400181954, 1398932695, 1397682613, 1396431709, 1395179984, 1393927438,
1392674072, 1391419886, 1390164882, 1388909060, 1387652422, 1386394966, 1385136696, 1383877610,
1382617710, 1381356997, 1380095472, 1378833134, 1377569986, 1376306026, 1375041258, 1373775680,
1372509294, 1371242101, 1369974101, 1368705296, 1367435685, 1366165269, 1364894050, 1363622028,
1362349204, 1361075579, 1359801152, 1358525926, 1357249901, 1355973077, 1354695455, 1353417037,
1352137822, 1350857812, 1349577007, 1348295409, 1347013017, 1345729833, 1344445857, 1343161090,
1341875533, 1340589187, 1339302052, 1338014129, 1336725419, 1335435923, 1334145641, 1332854574,
1331562723, 1330270089, 1328976672, 1327682474, 1326387494, 1325091734, 1323795195, 1322497877,
1321199781, 1319900907, 1318601257, 1317300832, 1315999631, 1314697657, 1313394909, 1312091388,
1310787095, 1309482032, 1308176198, 1306869594, 1305562222, 1304254082, 1302945174, 1301635500,
1300325060, 1299013855, 1297701886, 1296389154, 1295075659, 1293761402, 1292446384, 1291130606,
1289814068, 1288496772, 1287178717, 1285859905, 1284540337, 1283220013, 1281898935, 1280577102,
1279254516, 1277931177, 1276607086, 1275282245, 1273956653, 1272630312, 1271303222, 1269975384,
1268646800, 1267317469, 1265987392, 1264656571, 1263325005, 1261992697, 1260659646, 1259325853,
1257991320, 1256656047, 1255320034, 1253983283, 1252645794, 1251307568, 1249968606, 1248628909,
1247288478, 1245947312, 1244605414, 1243262783, 1241919421, 1240575329, 1239230506, 1237884955,
1236538675, 1235191668, 1233843935, 1232495475, 1231146291, 1229796382, 1228445750, 1227094395,
1225742318, 1224389521, 1223036002, 1221681765, 1220326809, 1218971135, 1217614743, 1216257636,
1214899813, 1213541275, 1212182024, 1210822059, 1209461382, 1208099993, 1206737894, 1205375085,
1204011567, 1202647340, 1201282407, 1199916766, 1198550419, 1197183368, 1195815612, 1194447153,
1193077991, 1191708127, 1190337562, 1188966297, 1187594332, 1186221669, 1184848308, 1183474250,
1182099496, 1180724046, 1179347902, 1177971064, 1176593533, 1175215310, 1173836395, 1172456790,
1171076495, 1169695512, 1168313840, 1166931481, 1165548435, 1164164704, 1162780288, 1161395188,
1160009405, 1158622939, 1157235792, 1155847964, 1154459456, 1153070269, 1151680403, 1150289860,
1148898640, 1147506745, 1146114174, 1144720929, 1143327011, 1141932420, 1140537158, 1139141224,
1137744621, 1136347348, 1134949406, 1133550797, 1132151521, 1130751579, 1129350972, 1127949701,
1126547765, 1125145168, 1123741908, 1122337987, 1120933406, 1119528166, 1118122267, 1116715710,
1115308496, 1113900627, 1112492101, 1111082922, 1109673089, 1108262603, 1106851465, 1105439676,
1104027237, 1102614148, 1101200410, 1099786025, 1098370993, 1096955314, 1095538991, 1094122023,
1092704411, 1091286156, 1089867259, 1088447722, 1087027544, 1085606726, 1084185270, 1082763176,
1081340445, 1079917078, 1078493076, 1077068439, 1075643169, 1074217266, 1072790730, 1071363564,
1069935768, 1068507342, 1067078288, 1065648605, 1064218296, 1062787361, 1061355801, 1059923616,
1058490808, 1057057377, 1055623324, 1054188651, 1052753357, 1051317443, 1049880912, 1048443763,
1047005996, 1045567615, 1044128617, 1042689006, 1041248781, 1039807944, 1038366495, 1036924436,
1035481766, 1034038487, 1032594600, 1031150105, 1029705004, 1028259297, 1026812985, 1025366069,
1023918550, 1022470428, 1021021705, 1019572382, 1018122458, 1016671936, 1015220816, 1013769098,
1012316784, 1010863875, 1009410370, 1007956272, 1006501581, 1005046298, 1003590424, 1002133959,
1000676905, 999219262, 997761031, 996302214, 994842810, 993382821, 991922248, 990461091,
988999351, 987537030, 986074127, 984610645, 983146583, 981681943, 980216726, 978750932,
977284562, 975817617, 974350098, 972882006, 971413342, 969944106, 968474300, 967003923,
965532978, 964061465, 962589385, 961116739, 959643527, 958169751, 956695411, 955220508,
953745043, 952269017, 950792431, 949315286, 947837582, 946359321, 944880503, 943401129,
941921200, 940440717, 938959681, 937478092, 935995952, 934513261, 933030021, 931546231,
930061894, 928577010, 927091579, 925605603, 924119082, 922632018, 921144411, 919656262,
918167572, 916678342, 915188572, 913698265, 912207419, 910716038, 909224120, 907731667,
906238681, 904745161, 903251110, 901756526, 900261413, 898765769, 897269597, 895772898,
894275671, 892777918, 891279640, 889780838, 888281512, 886781663, 885281293, 883780402,
882278992, 880777062, 879274614, 877771649, 876268167, 874764170, 873259659, 871754633,
870249095, 868743045, 867236484, 865729413, 864221832, 862713743, 861205147, 859696043,
858186435, 856676321, 855165703, 853654582, 852142959, 850630835, 849118210, 847605086,
846091463, 844577343, 843062726, 841547612, 840032004, 838515901, 836999305, 835482217,
833964638, 832446567, 830928007, 829408958, 827889422, 826369398, 824848888, 823327893,
821806413, 820284450, 818762005, 817239078, 815715670, 814191782, 812667415, 811142571,
809617249, 808091450, 806565177, 805038429, 803511207, 801983513, 800455346, 798926709,
797397602, 795868026, 794337982, 792807470, 791276492, 789745049, 788213141, 786680769,
785147934, 783614638, 782080880, 780546663, 779011986, 777476851, 775941259, 774405210,
772868706, 771331747, 769794334, 768256469, 766718151, 765179382, 763640164, 762100496,
760560380, 759019816, 757478806, 755937350, 754395449, 752853105, 751310318, 749767089,
748223418, 746679308, 745134758, 743589770, 742044345, 740498483, 738952186, 737405453,
735858287, 734310688, 732762657, 731214195, 729665303, 728115982, 726566232, 725016055,
723465451, 721914422, 720362968, 718811090, 717258790, 715706067, 714152924, 712599360,
711045377, 709490976, 707936158, 706380923, 704825272, 703269207, 701712728, 700155836,
698598533, 697040818, 695482694, 693924160, 692365218, 690805869, 689246113, 687685952,
686125387, 684564417, 683003045, 681441272, 679879097, 678316522, 676753549, 675190177,
673626408, 672062243, 670497682, 668932727, 667367379, 665801638, 664235505, 662668981,
661102068, 659534766, 657967075, 656398998, 654830535, 653261686, 651692453, 650122837,
648552838, 646982457, 645411696, 643840556, 642269036, 640697139, 639124865, 637552215,
635979190, 634405791, 632832018, 631257873, 629683357, 628108471, 626533215, 624957590,
623381598, 621805239, 620228514, 618651424, 617073971, 615496154, 613917975, 612339436,
610760536, 609181276, 607601658, 606021683, 604441352, 602860664, 601279623, 599698227,
598116479, 596534378, 594951927, 593369126, 591785976, 590202477, 588618632, 587034440,
585449903, 583865021, 582279796, 580694229, 579108320, 577522070, 575935480, 574348552,
572761285, 571173682, 569585743, 567997469, 566408860, 564819919, 563230645, 561641039,
560051104, 558460839, 556870245, 555279324, 553688076, 552096502, 550504604, 548912382,
547319836, 545726969, 544133781, 542540273, 540946445, 539352300, 537757837, 536163058,
534567963, 532972554, 531376831, 529780796, 528184449, 526587791, 524990824, 523393547,
521795963, 520198072, 518599875, 517001373, 515402566, 513803457, 512204045, 510604332,
509004318, 507404005, 505803394, 504202485, 502601279, 500999778, 499397982, 497795892,
496193509, 494590835, 492987869, 491384614, 489781069, 488177236, 486573117, 484968710,
483364019, 481759043, 480153784, 478548243, 476942419, 475336316, 473729932, 472123270,
470516330, 468909114, 467301622, 465693854, 464085813, 462477499, 460868912, 459260055,
457650927, 456041530, 454431865, 452821933, 451211734, 449601270, 447990541, 446379549,
444768294, 443156777, 441545000, 439932963, 438320667, 436708113, 435095303, 433482236,
431868915, 430255339, 428641511, 427027430, 425413098, 423798515, 422183684, 420568604,
418953276, 417337703, 415721883, 414105819, 412489512, 410872962, 409256170, 407639137,
406021865, 404404353, 402786604, 401168618, 399550396, 397931939, 396313247, 394694323,
393075166, 391455778, 389836160, 388216313, 386596237, 384975934, 383355404, 381734649,
380113669, 378492466, 376871039, 375249392, 373627523, 372005435, 370383128, 368760603,
367137861, 365514903, 363891730, 362268343, 360644742, 359020930, 357396906, 355772673,
354148230, 352523578, 350898719, 349273654, 347648383, 346022908, 344397230, 342771348,
341145265, 339518981, 337892498, 336265816, 334638936, 333011859, 331384586, 329757119,
328129457, 326501602, 324873555, 323245317, 321616889, 319988272, 318359466, 316730474,
315101295, 313471930, 311842381, 310212649, 308582734, 306952638, 305322361, 303691904,
302061269, 300430456, 298799466, 297168301, 295536961, 293905447, 292273760, 290641901,
289009871, 287377671, 285745302, 284112765, 282480061, 280847190, 279214155, 277580955,
275947592, 274314066, 272680379, 271046532, 269412525, 267778360, 266144038, 264509558,
262874923, 261240134, 259605191, 257970095, 256334847, 254699448, 253063900, 251428203,
249792358, 248156366, 246520228, 244883945, 243247518, 241610947, 239974235, 238337382,
236700388, 235063255, 233425984, 231788575, 230151030, 228513350, 226875535, 225237587,
223599506, 221961294, 220322951, 218684479, 217045878, 215407149, 213768293, 212129312,
210490206, 208850976, 207211624, 205572149, 203932553, 202292838, 200653003, 199013051,
197372981, 195732795, 194092495, 192452080, 190811551, 189170911, 187530159, 185889297,
184248325, 182607245, 180966058, 179324764, 177683365, 176041861, 174400254, 172758544,
171116733, 169474820, 167832808, 166190698, 164548489, 162906184, 161263783, 159621287,
157978697, 156336015, 154693240, 153050374, 151407418, 149764374, 148121241, 146478021,
144834714, 143191323, 141547847, 139904288, 138260647, 136616925, 134973122, 133329239,
131685278, 130041240, 128397125, 126752935, 125108670, 123464332, 121819921, 120175438,
118530885, 116886262, 115241570, 113596810, 111951983, 110307091, 108662134, 107017112,
105372028, 103726882, 102081675, 100436408, 98791081, 97145697, 95500255, 93854758,
92209205, 90563597, 88917937, 87272224, 85626460, 83980645, 82334782, 80688869,
79042909, 77396903, 75750851, 74104755, 72458615, 70812432, 69166208, 67519943,
65873638, 64227295, 62580914, 60934496, 59288042, 57641553, 55995030, 54348475,
52701887, 51055268, 49408620, 47761942, 46115236, 44468503, 42821744, 41174960,
39528151, 37881320, 36234466, 34587590, 32940695, 31293780, 29646846, 27999895,
26352928, 24705945, 23058947, 21411936, 19764913, 18117878, 16470832, 14823776,
13176712, 11529640, 9882561, 8235476, 6588387, 4941294, 3294197, 1647099
};
/**
 * Offset lookup table filled by fft_lut_init_start() (reached through
 * ff_fft_lut_init()). Each entry is an offset divided by four (off >> 2).
 * 21845 is the number of leaves the recursion in fft_lut_init() produces
 * when started with size 1 << 17.
 */
uint16_t ff_fft_offsets_lut[21845];
/**
 * Recursively append offset entries to a lookup table.
 *
 * A block of fewer than 16 elements is a leaf: its offset, shifted right by
 * two, is stored at table[*index] and *index is advanced by one. Any larger
 * block is split into its first half followed by its third and fourth
 * quarters, which are visited in that order.
 *
 * @param table  destination table
 * @param off    offset of the current block
 * @param size   size of the current block
 * @param index  in/out cursor: position of the next free entry in table
 */
static void fft_lut_init(uint16_t *table, int off, int size, int *index)
{
    int half, quarter;

    if (size < 16) {
        /* Leaf: emit one entry and move the cursor forward. */
        const int slot = *index;

        table[slot] = off >> 2;
        *index      = slot + 1;
        return;
    }

    half    = size >> 1;
    quarter = size >> 2;

    fft_lut_init(table, off,               half,    index);
    fft_lut_init(table, off + half,        quarter, index);
    fft_lut_init(table, off + 3 * quarter, quarter, index);
}
/**
 * Populate ff_fft_offsets_lut from scratch, starting the recursion at
 * offset 0 with a block size of 1 << 17.
 */
static void fft_lut_init_start(void)
{
    int filled = 0; /* cursor advanced by fft_lut_init() for each entry */

    fft_lut_init(ff_fft_offsets_lut, 0, 1 << 17, &filled);
}
/**
 * Public entry point for building ff_fft_offsets_lut.
 *
 * The actual work is delegated to fft_lut_init_start() through
 * ff_thread_once() with a function-local AVOnce control.
 */
void ff_fft_lut_init(void)
{
    static AVOnce lut_once = AV_ONCE_INIT;

    ff_thread_once(&lut_once, fft_lut_init_start);
}

View File

@ -1,66 +0,0 @@
/*
* Copyright (c) 2012
* MIPS Technologies, Inc., California.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Authors: Stanislav Ocovaj (socovaj@mips.com)
* Goran Cordasic (goran@mips.com)
* Djordje Pesut (djordje@mips.com)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* definitions and tables for FFT
*/
#ifndef AVCODEC_FFT_TABLE_H
#define AVCODEC_FFT_TABLE_H
#include "libavcodec/fft.h"
#define MAX_LOG2_NFFT 17 //!< log2 of the maximum allowed FFT size
/** Maximum allowed FFT size, i.e. 1 << MAX_LOG2_NFFT. */
#define MAX_FFT_SIZE (1 << MAX_LOG2_NFFT)
/**
 * Constant table of 32-bit values; its definition holds
 * MAX_FFT_SIZE/(4*16) entries.
 * NOTE(review): the values look like a quarter-wave cosine scaled to the
 * int32_t range -- confirm before relying on that.
 */
extern const int32_t ff_w_tab_sr[];
/** Offset lookup table; populated by ff_fft_lut_init(). */
extern uint16_t ff_fft_offsets_lut[];
/** Fill ff_fft_offsets_lut; the fill itself is wrapped in ff_thread_once(). */
void ff_fft_lut_init(void);
#endif /* AVCODEC_FFT_TABLE_H */

View File

@ -1,628 +0,0 @@
/*
* FFT/IFFT transforms
* Copyright (c) 2008 Loren Merritt
* Copyright (c) 2002 Fabrice Bellard
* Partly based on libdjbfft by D. J. Bernstein
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* FFT/IFFT transforms.
*/
#include <stdlib.h>
#include <string.h>
#include "libavutil/mathematics.h"
#include "libavutil/thread.h"
#include "fft.h"
#include "fft-internal.h"
#if !FFT_FLOAT
#include "fft_table.h"
#else /* !FFT_FLOAT */
/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
#if !CONFIG_HARDCODED_TABLES
/*
 * One COSTABLE() invocation per power-of-two size from 16 to 131072.
 * COSTABLE is defined outside this file; presumably each invocation declares
 * the FFT_NAME(ff_cos_<size>) array referenced by FFT_NAME(ff_cos_tabs)
 * below -- TODO confirm. In this (non-hardcoded) configuration the contents
 * are written at runtime by init_ff_cos_tabs().
 */
COSTABLE(16);
COSTABLE(32);
COSTABLE(64);
COSTABLE(128);
COSTABLE(256);
COSTABLE(512);
COSTABLE(1024);
COSTABLE(2048);
COSTABLE(4096);
COSTABLE(8192);
COSTABLE(16384);
COSTABLE(32768);
COSTABLE(65536);
COSTABLE(131072);
/**
 * Fill the cosine table selected by index.
 *
 * With size = 1 << index, entries 0..size/4 receive
 * FIX15(cos(2*pi*k/size)); entries size/4+1..size/2-1 are then filled by
 * mirroring the already computed values, so table[size/2 - k] == table[k].
 *
 * @param index log2 of the table size; selects FFT_NAME(ff_cos_tabs)[index]
 */
static av_cold void init_ff_cos_tabs(int index)
{
    const int    size    = 1 << index;
    const int    quarter = size / 4;
    const double step    = 2 * M_PI / size;
    FFTSample *const table = FFT_NAME(ff_cos_tabs)[index];
    int k;

    /* Directly computed part: first quarter period, both ends included. */
    for (k = 0; k <= quarter; k++)
        table[k] = FIX15(cos(k * step));

    /* Mirrored part: copy the computed values in reverse order. */
    for (k = 1; k < quarter; k++)
        table[size / 2 - k] = table[k];
}
/**
 * Pairs a table initialization function with the AVOnce control that is
 * passed to ff_thread_once() together with it.
 * The field order matters: cos_tabs_init_once uses positional initializers.
 */
typedef struct CosTabsInitOnce {
void (*func)(void); /* initializer for one table size; NULL if none */
AVOnce control; /* once-control handed to ff_thread_once() */
} CosTabsInitOnce;
/*
 * Generate a parameterless wrapper init_ff_cos_tabs_<size>() that calls
 * init_ff_cos_tabs(index). The wrappers have the void (*)(void) shape stored
 * in CosTabsInitOnce.func. In every instantiation below, size == 1 << index.
 */
#define INIT_FF_COS_TABS_FUNC(index, size) \
static av_cold void init_ff_cos_tabs_ ## size (void)\
{ \
init_ff_cos_tabs(index); \
}
INIT_FF_COS_TABS_FUNC(4, 16)
INIT_FF_COS_TABS_FUNC(5, 32)
INIT_FF_COS_TABS_FUNC(6, 64)
INIT_FF_COS_TABS_FUNC(7, 128)
INIT_FF_COS_TABS_FUNC(8, 256)
INIT_FF_COS_TABS_FUNC(9, 512)
INIT_FF_COS_TABS_FUNC(10, 1024)
INIT_FF_COS_TABS_FUNC(11, 2048)
INIT_FF_COS_TABS_FUNC(12, 4096)
INIT_FF_COS_TABS_FUNC(13, 8192)
INIT_FF_COS_TABS_FUNC(14, 16384)
INIT_FF_COS_TABS_FUNC(15, 32768)
INIT_FF_COS_TABS_FUNC(16, 65536)
INIT_FF_COS_TABS_FUNC(17, 131072)
/*
 * Per-size init function and once-control, indexed by log2 of the table size
 * (the same index ff_init_ff_cos_tabs() receives). Slots 0..3 carry no init
 * function: the smallest table is the 16-entry one at index 4.
 */
static CosTabsInitOnce cos_tabs_init_once[] = {
{ NULL },
{ NULL },
{ NULL },
{ NULL },
{ init_ff_cos_tabs_16, AV_ONCE_INIT },
{ init_ff_cos_tabs_32, AV_ONCE_INIT },
{ init_ff_cos_tabs_64, AV_ONCE_INIT },
{ init_ff_cos_tabs_128, AV_ONCE_INIT },
{ init_ff_cos_tabs_256, AV_ONCE_INIT },
{ init_ff_cos_tabs_512, AV_ONCE_INIT },
{ init_ff_cos_tabs_1024, AV_ONCE_INIT },
{ init_ff_cos_tabs_2048, AV_ONCE_INIT },
{ init_ff_cos_tabs_4096, AV_ONCE_INIT },
{ init_ff_cos_tabs_8192, AV_ONCE_INIT },
{ init_ff_cos_tabs_16384, AV_ONCE_INIT },
{ init_ff_cos_tabs_32768, AV_ONCE_INIT },
{ init_ff_cos_tabs_65536, AV_ONCE_INIT },
{ init_ff_cos_tabs_131072, AV_ONCE_INIT },
};
av_cold void ff_init_ff_cos_tabs(int index)
{
ff_thread_once(&cos_tabs_init_once[index].control, cos_tabs_init_once[index].func);
}
#endif
/*
 * Pointers to the individual cosine tables, indexed by log2 of the table
 * size. Indexes 0..3 are NULL because no table smaller than 16 is listed.
 */
COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = {
NULL, NULL, NULL, NULL,
FFT_NAME(ff_cos_16),
FFT_NAME(ff_cos_32),
FFT_NAME(ff_cos_64),
FFT_NAME(ff_cos_128),
FFT_NAME(ff_cos_256),
FFT_NAME(ff_cos_512),
FFT_NAME(ff_cos_1024),
FFT_NAME(ff_cos_2048),
FFT_NAME(ff_cos_4096),
FFT_NAME(ff_cos_8192),
FFT_NAME(ff_cos_16384),
FFT_NAME(ff_cos_32768),
FFT_NAME(ff_cos_65536),
FFT_NAME(ff_cos_131072),
};
#endif /* FFT_FLOAT */
/*
 * Forward declarations of the C implementations that ff_fft_init() installs
 * as the initial fft_permute / fft_calc callbacks.
 */
static void fft_permute_c(FFTContext *s, FFTComplex *z);
static void fft_calc_c(FFTContext *s, FFTComplex *z);
/**
 * Compute the split-radix permutation value of element i for a transform of
 * n elements.
 *
 * For n <= 2 the result is the lowest bit of i. Otherwise, if bit n/2 of i is
 * clear, the result is twice the value for the half-size transform. If it is
 * set, the result is four times the value for the quarter-size transform,
 * plus one when inverse equals the negation of bit n/4 of i and minus one
 * otherwise. The result may therefore be negative; callers in this file
 * negate it and mask it with (n - 1).
 *
 * @param i       element index
 * @param n       transform size
 * @param inverse direction flag, compared against a 0/1 value
 * @return the (signed) permutation value
 */
static int split_radix_permutation(int i, int n, int inverse)
{
    int half, quarter, scaled;

    if (n <= 2)
        return i & 1;

    half = n >> 1;
    if ((i & half) == 0)
        return 2 * split_radix_permutation(i, half, inverse);

    quarter = half >> 1;
    scaled  = 4 * split_radix_permutation(i, quarter, inverse);

    return (inverse == !(i & quarter)) ? scaled + 1 : scaled - 1;
}
/*
 * Remapping of the position within a group of 16 entries. fft_perm_avx()
 * adds avx_tab[k] to the group base for groups that
 * is_second_half_of_fft32() reports as a second half.
 */
static const int avx_tab[] = {
0, 4, 1, 5, 8, 12, 9, 13, 2, 6, 3, 7, 10, 14, 11, 15
};
/**
 * Tell whether index i ends up in the upper 16 entries once the transform
 * of size n has been narrowed down to a block of at most 32 entries.
 *
 * While n is larger than 32 the block is narrowed to the part containing i:
 * the first half (size n/2), the third quarter or the fourth quarter (size
 * n/4 each), with i rebased to the start of the part chosen.
 *
 * @param i index within the transform
 * @param n transform size
 * @return nonzero if the rebased index is >= 16, zero otherwise
 */
static int is_second_half_of_fft32(int i, int n)
{
    while (n > 32) {
        const int half = n / 2;

        if (i >= half) {
            /* Upper half: pick the third or the fourth quarter. */
            const int three_quarters = 3 * n / 4;

            i -= (i < three_quarters) ? half : three_quarters;
            n /= 4;
        } else {
            n = half;
        }
    }

    return i >= 16;
}
/**
 * Build s->revtab for the FF_FFT_PERM_AVX permutation.
 *
 * The 1 << s->nbits entries are handled in groups of 16. For every source
 * index the destination slot is the negated split-radix permutation value
 * masked with (n - 1). The value stored depends on the group:
 *  - groups reported by is_second_half_of_fft32() store the group base plus
 *    avx_tab[k];
 *  - all other groups store the source index with its three lowest bits
 *    rearranged: bits 1-2 move down to bits 0-1 and bit 0 moves up to bit 2.
 *
 * @param s FFT context; reads nbits and inverse, writes revtab
 */
static av_cold void fft_perm_avx(FFTContext *s)
{
    const int n = 1 << s->nbits;
    int base, k;

    for (base = 0; base < n; base += 16) {
        const int second_half = is_second_half_of_fft32(base, n);

        for (k = 0; k < 16; k++) {
            const int src = base + k;
            const int dst = -split_radix_permutation(src, n, s->inverse) & (n - 1);

            if (second_half)
                s->revtab[dst] = base + avx_tab[k];
            else
                s->revtab[dst] = (src & ~7) | ((src >> 1) & 3) | ((src << 2) & 4);
        }
    }
}
/**
 * Set up an FFT context for a complex transform of 2^nbits points.
 *
 * Allocates the permutation table (16-bit entries for nbits <= 16, 32-bit
 * entries otherwise) and a scratch buffer of 2^nbits FFTComplex, installs the
 * C implementations as default callbacks, lets the architecture-specific init
 * functions override them, and finally fills the permutation table according
 * to s->fft_permutation.
 *
 * @param s       context to initialize; it does not need to be zeroed
 * @param nbits   log2 of the transform size, must be in [2, 17]
 * @param inverse stored in s->inverse and forwarded to
 *                split_radix_permutation()
 * @return 0 on success, -1 if nbits is out of range or an allocation failed
 *         (all buffers owned by the context are released in that case)
 */
av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
{
    int i, j, n;

    /* Every pointer released under "fail" must be valid before the first
     * jump there; tmp_buf used to be left untouched, so an out-of-range
     * nbits on a non-zeroed context freed an indeterminate pointer. */
    s->revtab   = NULL;
    s->revtab32 = NULL;
    s->tmp_buf  = NULL;

    if (nbits < 2 || nbits > 17)
        goto fail;
    s->nbits = nbits;
    n = 1 << nbits;

    /* Indices up to 2^16 - 1 fit in uint16_t; larger transforms need the
     * 32-bit table. Exactly one of the two tables is allocated. */
    if (nbits <= 16) {
        s->revtab = av_malloc(n * sizeof(uint16_t));
        if (!s->revtab)
            goto fail;
    } else {
        s->revtab32 = av_malloc(n * sizeof(uint32_t));
        if (!s->revtab32)
            goto fail;
    }
    s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
    if (!s->tmp_buf)
        goto fail;

    /* Defaults; the arch-specific init calls below may replace them. */
    s->inverse         = inverse;
    s->fft_permutation = FF_FFT_PERM_DEFAULT;

    s->fft_permute = fft_permute_c;
    s->fft_calc    = fft_calc_c;
#if CONFIG_MDCT
    s->imdct_calc  = ff_imdct_calc_c;
    s->imdct_half  = ff_imdct_half_c;
    s->mdct_calc   = ff_mdct_calc_c;
#endif

#if FFT_FLOAT
#if ARCH_AARCH64
    ff_fft_init_aarch64(s);
#elif ARCH_ARM
    ff_fft_init_arm(s);
#elif ARCH_PPC
    ff_fft_init_ppc(s);
#elif ARCH_X86
    ff_fft_init_x86(s);
#endif
#if HAVE_MIPSFPU
    ff_fft_init_mips(s);
#endif
    /* Make sure every cosine table up to the requested size exists. */
    for(j=4; j<=nbits; j++) {
        ff_init_ff_cos_tabs(j);
    }
#else /* FFT_FLOAT */
    ff_fft_lut_init();
#endif

    if (ARCH_X86 && FFT_FLOAT && s->fft_permutation == FF_FFT_PERM_AVX) {
        fft_perm_avx(s);
    } else {
/* Each helper below fills s->revtab<num>; the trailing ';' is part of the
 * macro, so the invocations further down are written without one. */
#define PROCESS_FFT_PERM_SWAP_LSBS(num) do {\
    for(i = 0; i < n; i++) {\
        int k;\
        j = i;\
        j = (j & ~3) | ((j >> 1) & 1) | ((j << 1) & 2);\
        k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\
        s->revtab##num[k] = j;\
    } \
} while(0);

#define PROCESS_FFT_PERM_DEFAULT(num) do {\
    for(i = 0; i < n; i++) {\
        int k;\
        j = i;\
        k = -split_radix_permutation(i, n, s->inverse) & (n - 1);\
        s->revtab##num[k] = j;\
    } \
} while(0);

#define SPLIT_RADIX_PERMUTATION(num) do { \
    if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS) {\
        PROCESS_FFT_PERM_SWAP_LSBS(num) \
    } else {\
        PROCESS_FFT_PERM_DEFAULT(num) \
    }\
} while(0);

        if (s->revtab)
            SPLIT_RADIX_PERMUTATION()
        if (s->revtab32)
            SPLIT_RADIX_PERMUTATION(32)

#undef PROCESS_FFT_PERM_DEFAULT
#undef PROCESS_FFT_PERM_SWAP_LSBS
#undef SPLIT_RADIX_PERMUTATION
    }

    return 0;
 fail:
    av_freep(&s->revtab);
    av_freep(&s->revtab32);
    av_freep(&s->tmp_buf);
    return -1;
}
/**
 * Reorder the 2^nbits elements of z according to the permutation table:
 * element j moves to position revtab[j] (or revtab32[j] when the 16-bit
 * table is absent). The reordering goes through s->tmp_buf and is then
 * copied back into z.
 */
static void fft_permute_c(FFTContext *s, FFTComplex *z)
{
    const int       count   = 1 << s->nbits;
    const uint16_t *perm16  = s->revtab;
    const uint32_t *perm32  = s->revtab32;
    FFTComplex     *scratch = s->tmp_buf;
    int idx;

    /* TODO: handle split-radix permute in a more optimal way, probably in-place */
    if (perm16) {
        for (idx = 0; idx < count; idx++)
            scratch[perm16[idx]] = z[idx];
    } else {
        for (idx = 0; idx < count; idx++)
            scratch[perm32[idx]] = z[idx];
    }

    memcpy(z, scratch, count * sizeof(FFTComplex));
}
/**
 * Release the buffers that ff_fft_init() allocated for this context.
 */
av_cold void ff_fft_end(FFTContext *s)
{
    /* Separate allocations, so the release order does not matter. */
    av_freep(&s->tmp_buf);
    av_freep(&s->revtab32);
    av_freep(&s->revtab);
}
#if !FFT_FLOAT
/**
 * Fixed-point FFT, computed in place on z (2^s->nbits complex values).
 *
 * Works in three stages: 4-point butterflies, 8-point butterflies, then one
 * pass per remaining size (nbits 4 .. s->nbits) using twiddle factors read
 * from ff_w_tab_sr. Block start positions come from ff_fft_offsets_lut.
 * Products are accumulated in 64 bits and rounded back with
 * (accu + 0x40000000) >> 31.
 *
 * Sums and differences are done on unsigned values - presumably to avoid
 * signed-overflow undefined behaviour; confirm against project conventions.
 *
 * NOTE(review): the shift count (16 - s->nbits) below is negative for
 * nbits > 16, while ff_fft_init accepts nbits up to 17 - confirm that the
 * fixed-point build is never used with such sizes.
 */
static void fft_calc_c(FFTContext *s, FFTComplex *z) {
int nbits, i, n, num_transforms, offset, step;
int n4, n2, n34;
unsigned tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
FFTComplex *tmpz;
const int fft_size = (1 << s->nbits);
int64_t accu;
num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
/* Stage 1: 4-point transforms on blocks located at lut[n] * 4. */
for (n=0; n<num_transforms; n++){
offset = ff_fft_offsets_lut[n] << 2;
tmpz = z + offset;
tmp1 = tmpz[0].re + (unsigned)tmpz[1].re;
tmp5 = tmpz[2].re + (unsigned)tmpz[3].re;
tmp2 = tmpz[0].im + (unsigned)tmpz[1].im;
tmp6 = tmpz[2].im + (unsigned)tmpz[3].im;
tmp3 = tmpz[0].re - (unsigned)tmpz[1].re;
tmp8 = tmpz[2].im - (unsigned)tmpz[3].im;
tmp4 = tmpz[0].im - (unsigned)tmpz[1].im;
tmp7 = tmpz[2].re - (unsigned)tmpz[3].re;
tmpz[0].re = tmp1 + tmp5;
tmpz[2].re = tmp1 - tmp5;
tmpz[0].im = tmp2 + tmp6;
tmpz[2].im = tmp2 - tmp6;
tmpz[1].re = tmp3 + tmp8;
tmpz[3].re = tmp3 - tmp8;
tmpz[1].im = tmp4 - tmp7;
tmpz[3].im = tmp4 + tmp7;
}
/* A 4-point FFT is complete after stage 1. */
if (fft_size < 8)
return;
num_transforms = (num_transforms >> 1) | 1;
/* Stage 2: 8-point transforms on blocks located at lut[n] * 8; elements
 * 4..7 are combined with elements 0..3 computed by stage 1. */
for (n=0; n<num_transforms; n++){
offset = ff_fft_offsets_lut[n] << 3;
tmpz = z + offset;
tmp1 = tmpz[4].re + (unsigned)tmpz[5].re;
tmp3 = tmpz[6].re + (unsigned)tmpz[7].re;
tmp2 = tmpz[4].im + (unsigned)tmpz[5].im;
tmp4 = tmpz[6].im + (unsigned)tmpz[7].im;
tmp5 = tmp1 + tmp3;
tmp7 = tmp1 - tmp3;
tmp6 = tmp2 + tmp4;
tmp8 = tmp2 - tmp4;
tmp1 = tmpz[4].re - (unsigned)tmpz[5].re;
tmp2 = tmpz[4].im - (unsigned)tmpz[5].im;
tmp3 = tmpz[6].re - (unsigned)tmpz[7].re;
tmp4 = tmpz[6].im - (unsigned)tmpz[7].im;
tmpz[4].re = tmpz[0].re - tmp5;
tmpz[0].re = tmpz[0].re + tmp5;
tmpz[4].im = tmpz[0].im - tmp6;
tmpz[0].im = tmpz[0].im + tmp6;
tmpz[6].re = tmpz[2].re - tmp8;
tmpz[2].re = tmpz[2].re + tmp8;
tmpz[6].im = tmpz[2].im + tmp7;
tmpz[2].im = tmpz[2].im - tmp7;
/* Multiply by sqrt(1/2) in Q31 with rounding. */
accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp1 + tmp2);
tmp5 = (int32_t)((accu + 0x40000000) >> 31);
accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 - tmp4);
tmp7 = (int32_t)((accu + 0x40000000) >> 31);
accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp2 - tmp1);
tmp6 = (int32_t)((accu + 0x40000000) >> 31);
accu = (int64_t)Q31(M_SQRT1_2)*(int)(tmp3 + tmp4);
tmp8 = (int32_t)((accu + 0x40000000) >> 31);
tmp1 = tmp5 + tmp7;
tmp3 = tmp5 - tmp7;
tmp2 = tmp6 + tmp8;
tmp4 = tmp6 - tmp8;
tmpz[5].re = tmpz[1].re - tmp1;
tmpz[1].re = tmpz[1].re + tmp1;
tmpz[5].im = tmpz[1].im - tmp2;
tmpz[1].im = tmpz[1].im + tmp2;
tmpz[7].re = tmpz[3].re - tmp4;
tmpz[3].re = tmpz[3].re + tmp4;
tmpz[7].im = tmpz[3].im + tmp3;
tmpz[3].im = tmpz[3].im - tmp3;
}
/* Stage 3: one pass per size from 16 points upwards. step is the stride
 * through the twiddle table and halves after every pass, n4 (a quarter of
 * the current block size) doubles. */
step = 1 << ((MAX_LOG2_NFFT-4) - 4);
n4 = 4;
for (nbits=4; nbits<=s->nbits; nbits++){
n2 = 2*n4;
n34 = 3*n4;
num_transforms = (num_transforms >> 1) | 1;
for (n=0; n<num_transforms; n++){
/* w_re walks the table forwards, w_im backwards from its end. */
const FFTSample *w_re_ptr = ff_w_tab_sr + step;
const FFTSample *w_im_ptr = ff_w_tab_sr + MAX_FFT_SIZE/(4*16) - step;
offset = ff_fft_offsets_lut[n] << nbits;
tmpz = z + offset;
/* Element 0 of each quarter needs no twiddle multiplication. */
tmp5 = tmpz[ n2].re + (unsigned)tmpz[n34].re;
tmp1 = tmpz[ n2].re - (unsigned)tmpz[n34].re;
tmp6 = tmpz[ n2].im + (unsigned)tmpz[n34].im;
tmp2 = tmpz[ n2].im - (unsigned)tmpz[n34].im;
tmpz[ n2].re = tmpz[ 0].re - tmp5;
tmpz[ 0].re = tmpz[ 0].re + tmp5;
tmpz[ n2].im = tmpz[ 0].im - tmp6;
tmpz[ 0].im = tmpz[ 0].im + tmp6;
tmpz[n34].re = tmpz[n4].re - tmp2;
tmpz[ n4].re = tmpz[n4].re + tmp2;
tmpz[n34].im = tmpz[n4].im + tmp1;
tmpz[ n4].im = tmpz[n4].im - tmp1;
for (i=1; i<n4; i++){
FFTSample w_re = w_re_ptr[0];
FFTSample w_im = w_im_ptr[0];
/* Rotate the elements of the third and fourth quarter by the twiddle. */
accu = (int64_t)w_re*tmpz[ n2+i].re;
accu += (int64_t)w_im*tmpz[ n2+i].im;
tmp1 = (int32_t)((accu + 0x40000000) >> 31);
accu = (int64_t)w_re*tmpz[ n2+i].im;
accu -= (int64_t)w_im*tmpz[ n2+i].re;
tmp2 = (int32_t)((accu + 0x40000000) >> 31);
accu = (int64_t)w_re*tmpz[n34+i].re;
accu -= (int64_t)w_im*tmpz[n34+i].im;
tmp3 = (int32_t)((accu + 0x40000000) >> 31);
accu = (int64_t)w_re*tmpz[n34+i].im;
accu += (int64_t)w_im*tmpz[n34+i].re;
tmp4 = (int32_t)((accu + 0x40000000) >> 31);
tmp5 = tmp1 + tmp3;
tmp1 = tmp1 - tmp3;
tmp6 = tmp2 + tmp4;
tmp2 = tmp2 - tmp4;
tmpz[ n2+i].re = tmpz[ i].re - tmp5;
tmpz[ i].re = tmpz[ i].re + tmp5;
tmpz[ n2+i].im = tmpz[ i].im - tmp6;
tmpz[ i].im = tmpz[ i].im + tmp6;
tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
w_re_ptr += step;
w_im_ptr -= step;
}
}
step >>= 1;
n4 <<= 1;
}
}
#else /* !FFT_FLOAT */
/*
 * Butterfly helpers for the floating-point FFT. They all operate on the
 * temporaries t1..t6, which must be declared in the enclosing function, and
 * are built on the BF and CMUL macros, which are defined outside this file
 * section.
 *
 * BUTTERFLIES combines a0..a3 in place using the values currently held in
 * t1, t2, t5 and t6.
 */
#define BUTTERFLIES(a0,a1,a2,a3) {\
BF(t3, t5, t5, t1);\
BF(a2.re, a0.re, a0.re, t5);\
BF(a3.im, a1.im, a1.im, t3);\
BF(t4, t6, t2, t6);\
BF(a3.re, a1.re, a1.re, t4);\
BF(a2.im, a0.im, a0.im, t6);\
}
// force loading all the inputs before storing any.
// this is slightly slower for small data, but avoids store->load aliasing
// for addresses separated by large powers of 2.
#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\
FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\
BF(t3, t5, t5, t1);\
BF(a2.re, a0.re, r0, t5);\
BF(a3.im, a1.im, i1, t3);\
BF(t4, t6, t2, t6);\
BF(a3.re, a1.re, r1, t4);\
BF(a2.im, a0.im, i0, t6);\
}
/* Multiply a2 by (wre, -wim) and a3 by (wre, wim) via CMUL into t1/t2 and
 * t5/t6, then run BUTTERFLIES on the four elements. */
#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\
CMUL(t1, t2, a2.re, a2.im, wre, -wim);\
CMUL(t5, t6, a3.re, a3.im, wre, wim);\
BUTTERFLIES(a0,a1,a2,a3)\
}
/* Same as TRANSFORM but without the multiplication: a2 and a3 are copied
 * into the temporaries unchanged. */
#define TRANSFORM_ZERO(a0,a1,a2,a3) {\
t1 = a2.re;\
t2 = a2.im;\
t5 = a3.re;\
t6 = a3.im;\
BUTTERFLIES(a0,a1,a2,a3)\
}
/* z[0...8n-1], w[1...2n-1] */
/*
 * PASS(name) defines "static void name(z, wre, n)", one combining pass over
 * four quarters of z located at offsets 0, 2n, 4n and 6n. wre is read
 * forwards and wim (= wre + 2n) backwards. The first pair of elements is
 * handled before the loop (element 0 without multiplication); the loop then
 * handles two elements per iteration.
 *
 * n is decremented once before the do/while, whose condition is "--n" on an
 * unsigned value, so n == 1 would wrap around; the DECL_FFT users in this
 * file pass n4/2 with n4 >= 4.
 *
 * Two instances are generated: "pass" with BUTTERFLIES, and - unless
 * CONFIG_SMALL is set - "pass_big" with BUTTERFLIES redefined to
 * BUTTERFLIES_BIG. BUTTERFLIES stays redefined for the code that follows.
 */
#define PASS(name)\
static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\
{\
FFTDouble t1, t2, t3, t4, t5, t6;\
int o1 = 2*n;\
int o2 = 4*n;\
int o3 = 6*n;\
const FFTSample *wim = wre+o1;\
n--;\
\
TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\
TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
do {\
z += 2;\
wre += 2;\
wim -= 2;\
TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\
TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
} while(--n);\
}
PASS(pass)
#if !CONFIG_SMALL
#undef BUTTERFLIES
#define BUTTERFLIES BUTTERFLIES_BIG
PASS(pass_big)
#endif
/*
 * DECL_FFT(n, n2, n4) defines fft<n>(z): it runs fft<n2> on the start of z,
 * fft<n4> on the blocks starting at z + 2*n4 and z + 3*n4, and then combines
 * them with pass() using the cosine table ff_cos_<n>. Which function "pass"
 * names depends on the point of expansion (it is redefined to pass_big
 * further down when CONFIG_SMALL is not set).
 */
#define DECL_FFT(n,n2,n4)\
static void fft##n(FFTComplex *z)\
{\
fft##n2(z);\
fft##n4(z+n4*2);\
fft##n4(z+n4*3);\
pass(z,FFT_NAME(ff_cos_##n),n4/2);\
}
/**
 * 4-point transform, computed in place on z[0..3].
 *
 * Built entirely from BF (defined outside this file section); the statement
 * order matters because later BF calls consume temporaries produced by
 * earlier ones and overwrite z.
 */
static void fft4(FFTComplex *z)
{
FFTDouble t1, t2, t3, t4, t5, t6, t7, t8;
BF(t3, t1, z[0].re, z[1].re);
BF(t8, t6, z[3].re, z[2].re);
BF(z[2].re, z[0].re, t1, t6);
BF(t4, t2, z[0].im, z[1].im);
BF(t7, t5, z[2].im, z[3].im);
BF(z[3].im, z[1].im, t4, t8);
BF(z[3].re, z[1].re, t3, t7);
BF(z[2].im, z[0].im, t2, t5);
}
/**
 * 8-point transform, computed in place on z[0..7].
 *
 * Runs fft4 on z[0..3], pre-combines the pairs (z[4], z[5]) and
 * (z[6], z[7]) into t1/t2/t5/t6 and z[5]/z[7], then merges everything with
 * BUTTERFLIES and one TRANSFORM using sqrthalf (defined elsewhere) for both
 * twiddle components.
 */
static void fft8(FFTComplex *z)
{
FFTDouble t1, t2, t3, t4, t5, t6;
fft4(z);
BF(t1, z[5].re, z[4].re, -z[5].re);
BF(t2, z[5].im, z[4].im, -z[5].im);
BF(t5, z[7].re, z[6].re, -z[7].re);
BF(t6, z[7].im, z[6].im, -z[7].im);
BUTTERFLIES(z[0],z[2],z[4],z[6]);
TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf);
}
/* fft16 is written out by hand unless CONFIG_SMALL is set, in which case the
 * generic DECL_FFT expansion is used instead. */
#if !CONFIG_SMALL
/**
 * 16-point transform, computed in place on z[0..15]: fft8 on z[0..7], fft4
 * on z[8..11] and z[12..15], followed by four explicit combining steps with
 * twiddles taken from sqrthalf and entries 1 and 3 of ff_cos_16.
 */
static void fft16(FFTComplex *z)
{
FFTDouble t1, t2, t3, t4, t5, t6;
FFTSample cos_16_1 = FFT_NAME(ff_cos_16)[1];
FFTSample cos_16_3 = FFT_NAME(ff_cos_16)[3];
fft8(z);
fft4(z+8);
fft4(z+12);
TRANSFORM_ZERO(z[0],z[4],z[8],z[12]);
TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf);
TRANSFORM(z[1],z[5],z[9],z[13],cos_16_1,cos_16_3);
TRANSFORM(z[3],z[7],z[11],z[15],cos_16_3,cos_16_1);
}
#else
DECL_FFT(16,8,4)
#endif
/* Generate fft32 .. fft131072. Sizes up to 512 expand with "pass"; when
 * CONFIG_SMALL is not set, "pass" is redefined to pass_big before the larger
 * sizes are expanded, so those use the BUTTERFLIES_BIG variant. */
DECL_FFT(32,16,8)
DECL_FFT(64,32,16)
DECL_FFT(128,64,32)
DECL_FFT(256,128,64)
DECL_FFT(512,256,128)
#if !CONFIG_SMALL
#define pass pass_big
#endif
DECL_FFT(1024,512,256)
DECL_FFT(2048,1024,512)
DECL_FFT(4096,2048,1024)
DECL_FFT(8192,4096,2048)
DECL_FFT(16384,8192,4096)
DECL_FFT(32768,16384,8192)
DECL_FFT(65536,32768,16384)
DECL_FFT(131072,65536,32768)
/* Transform functions ordered by size: entry 0 is fft4, entry 15 is
 * fft131072. fft_calc_c() indexes this table with nbits - 2. */
static void (* const fft_dispatch[])(FFTComplex*) = {
fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024,
fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, fft131072
};
/**
 * Floating-point FFT entry point: look up the size-specific transform for
 * s->nbits in fft_dispatch and run it in place on z.
 */
static void fft_calc_c(FFTContext *s, FFTComplex *z)
{
    void (*const transform)(FFTComplex *) = fft_dispatch[s->nbits - 2];

    transform(z);
}
#endif /* !FFT_FLOAT */

View File

@ -1,51 +0,0 @@
/*
* Copyright (c) 2012
* MIPS Technologies, Inc., California.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Authors: Stanislav Ocovaj (socovaj@mips.com)
* Goran Cordasic (goran@mips.com)
* Djordje Pesut (djordje@mips.com)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define FFT_FLOAT 0
#include "mdct_template.c"

View File

@ -1,20 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define FFT_FLOAT 1
#include "mdct_template.c"

View File

@ -1,209 +0,0 @@
/*
* MDCT/IMDCT transforms
* Copyright (c) 2002 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdlib.h>
#include <string.h>
#include "libavutil/common.h"
#include "libavutil/libm.h"
#include "libavutil/mathematics.h"
#include "fft.h"
#include "fft-internal.h"
/**
* @file
* MDCT/IMDCT transforms.
*/
/* RSCALE(x, y) adds two input samples. The float build uses a plain sum; the
 * fixed-point build adds in unsigned arithmetic, adds 32 for rounding and
 * shifts the result right by 6 bits. */
#if FFT_FLOAT
# define RSCALE(x, y) ((x) + (y))
#else
# define RSCALE(x, y) ((int)((x) + (unsigned)(y) + 32) >> 6)
#endif
/**
 * Prepare a context for MDCT / IMDCT of size 2^nbits.
 *
 * The context is cleared first, then an FFT of 2^(nbits - 2) points is set
 * up and a table of 2^nbits / 2 rotation factors is allocated; tsin points
 * into that same allocation.
 *
 * @param s       context to initialize (previous contents are discarded)
 * @param nbits   log2 of the MDCT size
 * @param inverse forwarded to ff_fft_init()
 * @param scale   a negative value adds a quarter-size offset to the rotation
 *                phase; sqrt(|scale|) is applied to the table values in the
 *                float build only
 * @return 0 on success, -1 on failure (the context is released again)
 */
av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale)
{
    const int size    = 1 << nbits;
    const int quarter = size >> 2;
    double angle, phase;
    int stride, k;

    memset(s, 0, sizeof(*s));
    s->mdct_bits        = nbits;
    s->mdct_size        = size;
    s->mdct_permutation = FF_MDCT_PERM_NONE;

    if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0)
        goto fail;

    s->tcos = av_malloc_array(size / 2, sizeof(FFTSample));
    if (!s->tcos)
        goto fail;

    /* Layout of the sine factors relative to the cosine factors. */
    if (s->mdct_permutation == FF_MDCT_PERM_NONE) {
        s->tsin = s->tcos + quarter;
        stride  = 1;
    } else if (s->mdct_permutation == FF_MDCT_PERM_INTERLEAVE) {
        s->tsin = s->tcos + 1;
        stride  = 2;
    } else {
        goto fail;
    }

    phase = 1.0 / 8.0 + (scale < 0 ? quarter : 0);
    scale = sqrt(fabs(scale));
    for (k = 0; k < quarter; k++) {
        angle = 2 * M_PI * (k + phase) / size;
#if !FFT_FLOAT
        s->tcos[k * stride] = lrint(-cos(angle) * 2147483648.0);
        s->tsin[k * stride] = lrint(-sin(angle) * 2147483648.0);
#else
        s->tcos[k * stride] = FIX15(-cos(angle) * scale);
        s->tsin[k * stride] = FIX15(-sin(angle) * scale);
#endif
    }
    return 0;

fail:
    ff_mdct_end(s);
    return -1;
}
/**
 * Compute the middle half of the inverse MDCT of size N = 2^nbits; the
 * remaining parts follow by symmetry and are not produced here.
 *
 * Three steps: pre-rotation of the input into permuted positions of the
 * output buffer, the FFT via s->fft_calc, and a post-rotation that also
 * reorders the result.
 *
 * @param output N/2 samples, also used as the complex work buffer
 * @param input  N/2 samples
 */
void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    const int size    = 1 << s->mdct_bits;
    const int half    = size >> 1;
    const int quarter = size >> 2;
    const int eighth  = size >> 3;
    const uint16_t  *perm    = s->revtab;
    const FFTSample *cos_tab = s->tcos;
    const FFTSample *sin_tab = s->tsin;
    FFTComplex *spec = (FFTComplex *)output;
    int idx;

    /* pre rotation: pair the idx-th even sample from the front with the
     * idx-th odd sample from the back */
    for (idx = 0; idx < quarter; idx++) {
        const int dst = perm[idx];

        CMUL(spec[dst].re, spec[dst].im,
             input[half - 1 - 2 * idx], input[2 * idx],
             cos_tab[idx], sin_tab[idx]);
    }

    s->fft_calc(s, spec);

    /* post rotation + reordering: work outwards from the centre, both
     * products are computed before any of the four values is stored */
    for (idx = 0; idx < eighth; idx++) {
        const int lo = eighth - idx - 1;
        const int hi = eighth + idx;
        FFTSample lo_re, lo_im, hi_re, hi_im;

        CMUL(lo_re, hi_im, spec[lo].im, spec[lo].re, sin_tab[lo], cos_tab[lo]);
        CMUL(hi_re, lo_im, spec[hi].im, spec[hi].re, sin_tab[hi], cos_tab[hi]);
        spec[lo].re = lo_re;
        spec[lo].im = lo_im;
        spec[hi].re = hi_re;
        spec[hi].im = hi_im;
    }
}
/**
 * Compute the full inverse MDCT of size N = 2^nbits.
 *
 * The middle half is produced by ff_imdct_half_c() at output[N/4]; the first
 * quarter is the negated mirror image of the second quarter and the last
 * quarter is the mirror image of the third.
 *
 * @param output N samples
 * @param input  N/2 samples
 */
void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    const int size    = 1 << s->mdct_bits;
    const int half    = size >> 1;
    const int quarter = size >> 2;
    int idx;

    ff_imdct_half_c(s, output + quarter, input);

    for (idx = 0; idx < quarter; idx++) {
        output[idx]            = -output[half - idx - 1];
        output[size - idx - 1] =  output[half + idx];
    }
}
/**
 * Compute the forward MDCT of size N = 2^nbits.
 *
 * The input is folded and pre-rotated into permuted positions of the output
 * buffer, transformed with s->fft_calc, and post-rotated in place.
 *
 * @param input N samples
 * @param out   N/2 samples, also used as the complex work buffer
 */
void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
{
    const int size      = 1 << s->mdct_bits;
    const int half      = size >> 1;
    const int quarter   = size >> 2;
    const int eighth    = size >> 3;
    const int three_q   = 3 * quarter;
    const uint16_t  *perm    = s->revtab;
    const FFTSample *cos_tab = s->tcos;
    const FFTSample *sin_tab = s->tsin;
    FFTComplex *spec = (FFTComplex *)out;
    FFTDouble re, im;
    int idx, dst;

    /* pre rotation: each iteration fills two FFT inputs, one for the lower
     * and one for the upper half of the rotation tables */
    for (idx = 0; idx < eighth; idx++) {
        const int twice = 2 * idx;
        const int upper = eighth + idx;

        re  = RSCALE(-input[twice + three_q], -input[three_q - 1 - twice]);
        im  = RSCALE(-input[quarter + twice],  input[quarter - 1 - twice]);
        dst = perm[idx];
        CMUL(spec[dst].re, spec[dst].im, re, im, -cos_tab[idx], sin_tab[idx]);

        re  = RSCALE( input[twice],        -input[half - 1 - twice]);
        im  = RSCALE(-input[half + twice], -input[size - 1 - twice]);
        dst = perm[upper];
        CMUL(spec[dst].re, spec[dst].im, re, im, -cos_tab[upper], sin_tab[upper]);
    }

    s->fft_calc(s, spec);

    /* post rotation: work outwards from the centre, both products are
     * computed before any of the four values is stored */
    for (idx = 0; idx < eighth; idx++) {
        const int lo = eighth - idx - 1;
        const int hi = eighth + idx;
        FFTSample lo_re, lo_im, hi_re, hi_im;

        CMUL(hi_im, lo_re, spec[lo].re, spec[lo].im, -sin_tab[lo], -cos_tab[lo]);
        CMUL(lo_im, hi_re, spec[hi].re, spec[hi].im, -sin_tab[hi], -cos_tab[hi]);
        spec[lo].re = lo_re;
        spec[lo].im = lo_im;
        spec[hi].re = hi_re;
        spec[hi].im = hi_im;
    }
}
/**
 * Release everything owned by an MDCT context: the FFT buffers and the
 * rotation table (tsin points into the tcos allocation and is not freed
 * separately).
 */
av_cold void ff_mdct_end(FFTContext *s)
{
    ff_fft_end(s);
    av_freep(&s->tcos);
}

View File

@ -13,7 +13,6 @@ MIPSFPU-OBJS-$(CONFIG_AMRWB_DECODER) += mips/acelp_filters_mips.o \
mips/acelp_vectors_mips.o
MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_float.o
MIPSDSP-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_fixed.o
MIPSFPU-OBJS-$(CONFIG_FFT) += mips/fft_mips.o
MIPSFPU-OBJS-$(CONFIG_FMTCONVERT) += mips/fmtconvert_mips.o
OBJS-$(CONFIG_AC3DSP) += mips/ac3dsp_mips.o
OBJS-$(CONFIG_AAC_DECODER) += mips/aacdec_mips.o \

View File

@ -1,516 +0,0 @@
/*
* Copyright (c) 2012
* MIPS Technologies, Inc., California.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Author: Stanislav Ocovaj (socovaj@mips.com)
* Author: Zoran Lukic (zoranl@mips.com)
*
* Optimized MDCT/IMDCT and FFT transforms
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft_table.h"
#include "libavutil/mips/asmdefs.h"
/**
* FFT transform
*/
#if HAVE_INLINE_ASM
#if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
/**
 * FFT for MIPS, computed in place on z (2^s->nbits complex values).
 *
 * Only compiled when inline assembly is available and the target is neither
 * MIPS32R6 nor MIPS64R6 (see the enclosing preprocessor conditions).
 *
 * Structure: 4-point butterflies in plain C, 8-point butterflies in inline
 * assembly, then one pass per remaining size (nbits 4 .. s->nbits) with
 * twiddle factors read from ff_cos_131072. Block start positions come from
 * ff_fft_offsets_lut. The C equivalent of each group of instructions is
 * given in the comments next to the assembly.
 *
 * Byte offsets in the assembly address the interleaved re/im members of
 * consecutive FFTComplex elements (8 bytes per element, re at +0, im at +4,
 * as implied by the per-instruction comments).
 */
static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
{
int nbits, i, n, num_transforms, offset, step;
int n4, n2, n34;
FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
FFTComplex *tmpz;
float w_re, w_im;
float *w_re_ptr, *w_im_ptr;
const int fft_size = (1 << s->nbits);
float pom, pom1, pom2, pom3;
float temp, temp1, temp3, temp4;
FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
/* approximately sqrt(1/2), the twiddle factor of the 8-point stage */
float f1 = 0.7071067812;
num_transforms = (21845 >> (17 - s->nbits)) | 1;
/* Stage 1: 4-point transforms on blocks located at lut[n] * 4. */
for (n=0; n<num_transforms; n++) {
offset = ff_fft_offsets_lut[n] << 2;
tmpz = z + offset;
tmp1 = tmpz[0].re + tmpz[1].re;
tmp5 = tmpz[2].re + tmpz[3].re;
tmp2 = tmpz[0].im + tmpz[1].im;
tmp6 = tmpz[2].im + tmpz[3].im;
tmp3 = tmpz[0].re - tmpz[1].re;
tmp8 = tmpz[2].im - tmpz[3].im;
tmp4 = tmpz[0].im - tmpz[1].im;
tmp7 = tmpz[2].re - tmpz[3].re;
tmpz[0].re = tmp1 + tmp5;
tmpz[2].re = tmp1 - tmp5;
tmpz[0].im = tmp2 + tmp6;
tmpz[2].im = tmp2 - tmp6;
tmpz[1].re = tmp3 + tmp8;
tmpz[3].re = tmp3 - tmp8;
tmpz[1].im = tmp4 - tmp7;
tmpz[3].im = tmp4 + tmp7;
}
/* A 4-point FFT is complete after stage 1. */
if (fft_size < 8)
return;
num_transforms = (num_transforms >> 1) | 1;
/* Stage 2: 8-point transforms on blocks located at lut[n] * 8. */
for (n=0; n<num_transforms; n++) {
offset = ff_fft_offsets_lut[n] << 3;
tmpz = z + offset;
__asm__ volatile (
"lwc1 %[tmp1], 32(%[tmpz]) \n\t"
"lwc1 %[pom], 40(%[tmpz]) \n\t"
"lwc1 %[tmp3], 48(%[tmpz]) \n\t"
"lwc1 %[pom1], 56(%[tmpz]) \n\t"
"lwc1 %[tmp2], 36(%[tmpz]) \n\t"
"lwc1 %[pom2], 44(%[tmpz]) \n\t"
"lwc1 %[pom3], 60(%[tmpz]) \n\t"
"lwc1 %[tmp4], 52(%[tmpz]) \n\t"
"add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re;
"add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re;
"add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im;
"lwc1 %[pom], 40(%[tmpz]) \n\t"
"add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im;
"add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
"sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3;
"lwc1 %[tmp1], 32(%[tmpz]) \n\t"
"lwc1 %[pom1], 44(%[tmpz]) \n\t"
"add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
"sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4;
"lwc1 %[tmp2], 36(%[tmpz]) \n\t"
"lwc1 %[pom2], 56(%[tmpz]) \n\t"
"lwc1 %[pom3], 60(%[tmpz]) \n\t"
"lwc1 %[tmp3], 48(%[tmpz]) \n\t"
"lwc1 %[tmp4], 52(%[tmpz]) \n\t"
"sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re;
"lwc1 %[pom], 0(%[tmpz]) \n\t"
"sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im;
"sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re;
"lwc1 %[pom2], 4(%[tmpz]) \n\t"
"sub.s %[pom1], %[pom], %[tmp5] \n\t"
"sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im;
"add.s %[pom3], %[pom], %[tmp5] \n\t"
"sub.s %[pom], %[pom2], %[tmp6] \n\t"
"add.s %[pom2], %[pom2], %[tmp6] \n\t"
"swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5;
"swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5;
"swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6;
"swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
"lwc1 %[pom1], 16(%[tmpz]) \n\t"
"lwc1 %[pom3], 20(%[tmpz]) \n\t"
"add.s %[temp1],%[tmp1], %[tmp2] \n\t"
"sub.s %[temp], %[pom1], %[tmp8] \n\t"
"add.s %[pom2], %[pom3], %[tmp7] \n\t"
"sub.s %[temp3],%[tmp3], %[tmp4] \n\t"
"sub.s %[temp4],%[tmp2], %[tmp1] \n\t"
"swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8;
"swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7;
"add.s %[pom1], %[pom1], %[tmp8] \n\t"
"sub.s %[pom3], %[pom3], %[tmp7] \n\t"
"add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
"mul.s %[tmp5], %[f1], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
"mul.s %[tmp7], %[f1], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
"mul.s %[tmp6], %[f1], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
"mul.s %[tmp8], %[f1], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
"swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
"swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
"add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
"sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7;
"add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8;
"sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8;
"lwc1 %[temp], 8(%[tmpz]) \n\t"
"lwc1 %[temp1],12(%[tmpz]) \n\t"
"lwc1 %[pom], 24(%[tmpz]) \n\t"
"lwc1 %[pom2], 28(%[tmpz]) \n\t"
"sub.s %[temp4],%[temp], %[tmp1] \n\t"
"sub.s %[temp3],%[temp1], %[tmp2] \n\t"
"add.s %[temp], %[temp], %[tmp1] \n\t"
"add.s %[temp1],%[temp1], %[tmp2] \n\t"
"sub.s %[pom1], %[pom], %[tmp4] \n\t"
"add.s %[pom3], %[pom2], %[tmp3] \n\t"
"add.s %[pom], %[pom], %[tmp4] \n\t"
"sub.s %[pom2], %[pom2], %[tmp3] \n\t"
"swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1;
"swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2;
"swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1;
"swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2;
"swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4;
"swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3;
"swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4;
"swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3;
: [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
[tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp7]"=&f"(tmp7),
[tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
[temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
: [tmpz]"r"(tmpz), [f1]"f"(f1)
: "memory"
);
}
/* Stage 3: one pass per size from 16 points upwards. step is the stride
 * through the twiddle table and halves after every pass, n4 (a quarter of
 * the current block size) doubles. */
step = 1 << (MAX_LOG2_NFFT - 4);
n4 = 4;
for (nbits=4; nbits<=s->nbits; nbits++) {
num_transforms = (num_transforms >> 1) | 1;
n2 = 2 * n4;
n34 = 3 * n4;
for (n=0; n<num_transforms; n++) {
offset = ff_fft_offsets_lut[n] << nbits;
tmpz = z + offset;
tmpz_n2 = tmpz + n2;
tmpz_n4 = tmpz + n4;
tmpz_n34 = tmpz + n34;
/* Element 0 of each quarter needs no twiddle multiplication. */
__asm__ volatile (
"lwc1 %[pom1], 0(%[tmpz_n2]) \n\t"
"lwc1 %[pom], 0(%[tmpz_n34]) \n\t"
"lwc1 %[pom2], 4(%[tmpz_n2]) \n\t"
"lwc1 %[pom3], 4(%[tmpz_n34]) \n\t"
"lwc1 %[temp1],0(%[tmpz]) \n\t"
"lwc1 %[temp3],4(%[tmpz]) \n\t"
"add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re;
"sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re;
"add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im;
"sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im;
"sub.s %[temp], %[temp1], %[tmp5] \n\t"
"add.s %[temp1],%[temp1], %[tmp5] \n\t"
"sub.s %[temp4],%[temp3], %[tmp6] \n\t"
"add.s %[temp3],%[temp3], %[tmp6] \n\t"
"swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5;
"swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5;
"lwc1 %[pom1], 0(%[tmpz_n4]) \n\t"
"swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6;
"lwc1 %[temp], 4(%[tmpz_n4]) \n\t"
"swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6;
"sub.s %[pom], %[pom1], %[tmp2] \n\t"
"add.s %[pom1], %[pom1], %[tmp2] \n\t"
"add.s %[temp1],%[temp], %[tmp1] \n\t"
"sub.s %[temp], %[temp], %[tmp1] \n\t"
"swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2;
"swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2;
"swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1;
"swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1;
: [tmp5]"=&f"(tmp5),
[tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
[tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6), [pom3]"=&f"(pom3),
[temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
: [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4)
: "memory"
);
/* w_re walks ff_cos_131072 forwards, w_im backwards from MAX_FFT_SIZE/4. */
w_re_ptr = (float*)(ff_cos_131072 + step);
w_im_ptr = (float*)(ff_cos_131072 + MAX_FFT_SIZE/4 - step);
for (i=1; i<n4; i++) {
w_re = w_re_ptr[0];
w_im = w_im_ptr[0];
tmpz_n2_i = tmpz_n2 + i;
tmpz_n4_i = tmpz_n4 + i;
tmpz_n34_i= tmpz_n34 + i;
tmpz_i = tmpz + i;
__asm__ volatile (
"lwc1 %[temp], 0(%[tmpz_n2_i]) \n\t"
"lwc1 %[temp1], 4(%[tmpz_n2_i]) \n\t"
"lwc1 %[pom], 0(%[tmpz_n34_i]) \n\t"
"lwc1 %[pom1], 4(%[tmpz_n34_i]) \n\t"
"mul.s %[temp3], %[w_im], %[temp] \n\t"
"mul.s %[temp4], %[w_im], %[temp1] \n\t"
"mul.s %[pom2], %[w_im], %[pom1] \n\t"
"mul.s %[pom3], %[w_im], %[pom] \n\t"
"msub.s %[tmp2], %[temp3], %[w_re], %[temp1] \n\t" // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re;
"madd.s %[tmp1], %[temp4], %[w_re], %[temp] \n\t" // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im;
"msub.s %[tmp3], %[pom2], %[w_re], %[pom] \n\t" // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im;
"madd.s %[tmp4], %[pom3], %[w_re], %[pom1] \n\t" // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re;
"lwc1 %[temp], 0(%[tmpz_i]) \n\t"
"lwc1 %[pom], 4(%[tmpz_i]) \n\t"
"add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
"sub.s %[tmp1], %[tmp1], %[tmp3] \n\t" // tmp1 = tmp1 - tmp3;
"add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
"sub.s %[tmp2], %[tmp2], %[tmp4] \n\t" // tmp2 = tmp2 - tmp4;
"sub.s %[temp1], %[temp], %[tmp5] \n\t"
"add.s %[temp], %[temp], %[tmp5] \n\t"
"sub.s %[pom1], %[pom], %[tmp6] \n\t"
"add.s %[pom], %[pom], %[tmp6] \n\t"
"lwc1 %[temp3], 0(%[tmpz_n4_i]) \n\t"
"lwc1 %[pom2], 4(%[tmpz_n4_i]) \n\t"
"swc1 %[temp1], 0(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].re = tmpz[ i].re - tmp5;
"swc1 %[temp], 0(%[tmpz_i]) \n\t" // tmpz[ i].re = tmpz[ i].re + tmp5;
"swc1 %[pom1], 4(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].im = tmpz[ i].im - tmp6;
"swc1 %[pom] , 4(%[tmpz_i]) \n\t" // tmpz[ i].im = tmpz[ i].im + tmp6;
"sub.s %[temp4], %[temp3], %[tmp2] \n\t"
"add.s %[pom3], %[pom2], %[tmp1] \n\t"
"add.s %[temp3], %[temp3], %[tmp2] \n\t"
"sub.s %[pom2], %[pom2], %[tmp1] \n\t"
"swc1 %[temp4], 0(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
"swc1 %[pom3], 4(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
"swc1 %[temp3], 0(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
"swc1 %[pom2], 4(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
: [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3),
[tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6),
[temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
[pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3)
: [w_re]"f"(w_re), [w_im]"f"(w_im),
[tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i),
[tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i)
: "memory"
);
w_re_ptr += step;
w_im_ptr -= step;
}
}
step >>= 1;
n4 <<= 1;
}
}
/**
 * MDCT/IMDCT transforms.
 *
 * Half-size inverse MDCT using MIPS FPU inline assembly.
 *
 * Three stages:
 *   1. pre-rotation: pairs of input samples are rotated by (tcos, tsin) and
 *      written to the work buffer at bit-reversed positions (s->revtab);
 *   2. in-place complex FFT through s->fft_calc;
 *   3. post-rotation + reordering of the FFT result.
 *
 * @param s      context; reads mdct_bits, revtab, tcos, tsin and fft_calc
 * @param output work/output buffer, accessed as an array of FFTComplex
 * @param input  input coefficients; elements 0 .. n/2-1 are read,
 *               where n = 1 << s->mdct_bits
 *
 * All asm offsets assume 4-byte FFTSample (lwc1 loads one 32-bit word) and
 * an FFTComplex laid out as { re at +0, im at +4 }.
 */
static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
{
int k, n8, n4, n2, n, j;
const uint16_t *revtab = s->revtab;
const FFTSample *tcos = s->tcos;
const FFTSample *tsin = s->tsin;
const FFTSample *in1, *in2, *in3, *in4;
FFTComplex *z = (FFTComplex *)output;
int j1;
const float *tcos1, *tsin1, *tcos2, *tsin2;
float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
FFTComplex *z1, *z2;
n = 1 << s->mdct_bits;
n2 = n >> 1;
n4 = n >> 2;
n8 = n >> 3;
/* pre rotation */
/* in1/in3 walk forward from the start of the input, in2/in4 walk backward
 * from element n2-1; each iteration consumes one sample from each pointer
 * and produces two complex values (k and k+1). */
in1 = input;
in2 = input + n2 - 1;
in3 = input + 2;
in4 = input + n2 - 3;
tcos1 = tcos;
tsin1 = tsin;
/* n4 = 64 or 128 */
for(k = 0; k < n4; k += 2) {
j = revtab[k ];
j1 = revtab[k + 1];
/* Computes
 *   temp9  = in2[0]*tcos1[0] - in1[0]*tsin1[0]
 *   temp10 = in2[0]*tsin1[0] + in1[0]*tcos1[0]
 *   temp11 = in4[0]*tcos1[1] - in3[0]*tsin1[1]
 *   temp12 = in4[0]*tsin1[1] + in3[0]*tcos1[1]
 * (nmsub.s fd,fr,fs,ft gives fr - fs*ft; madd.s gives fr + fs*ft)
 * and then steps the pointers: tcos1/tsin1 by +2 floats, in1/in3 by
 * +4 floats, in2/in4 by -4 floats. Loads and pointer updates are
 * interleaved with the arithmetic, so instruction order matters. */
__asm__ volatile (
"lwc1 %[temp1], 0(%[in2]) \t\n"
"lwc1 %[temp2], 0(%[tcos1]) \t\n"
"lwc1 %[temp3], 0(%[tsin1]) \t\n"
"lwc1 %[temp4], 0(%[in1]) \t\n"
"lwc1 %[temp5], 0(%[in4]) \t\n"
"mul.s %[temp9], %[temp1], %[temp2] \t\n"
"mul.s %[temp10], %[temp1], %[temp3] \t\n"
"lwc1 %[temp6], 4(%[tcos1]) \t\n"
"lwc1 %[temp7], 4(%[tsin1]) \t\n"
"nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
"madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
"mul.s %[temp11], %[temp5], %[temp6] \t\n"
"mul.s %[temp12], %[temp5], %[temp7] \t\n"
"lwc1 %[temp8], 0(%[in3]) \t\n"
PTR_ADDIU " %[tcos1], %[tcos1], 8 \t\n"
PTR_ADDIU " %[tsin1], %[tsin1], 8 \t\n"
PTR_ADDIU " %[in1], %[in1], 16 \t\n"
"nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
"madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
PTR_ADDIU " %[in2], %[in2], -16 \t\n"
PTR_ADDIU " %[in3], %[in3], 16 \t\n"
PTR_ADDIU " %[in4], %[in4], -16 \t\n"
: [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
[temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
[temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
[temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
[temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
[temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
[tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1),
[in1]"+r"(in1), [in2]"+r"(in2),
[in3]"+r"(in3), [in4]"+r"(in4)
:
: "memory"
);
/* scatter the two rotated values to their bit-reversed slots */
z[j ].re = temp9;
z[j ].im = temp10;
z[j1].re = temp11;
z[j1].im = temp12;
}
s->fft_calc(s, z);
/* post rotation + reordering */
/* n8 = 32 or 64 */
/* z1 walks down from the middle of the buffer and z2 walks up from it;
 * each iteration rotates two complex values on each side. */
for(k = 0; k < n8; k += 2) {
tcos1 = &tcos[n8 - k - 2];
tsin1 = &tsin[n8 - k - 2];
tcos2 = &tcos[n8 + k];
tsin2 = &tsin[n8 + k];
z1 = &z[n8 - k - 2];
z2 = &z[n8 + k ];
/* Computes, for (zX[i], tsinX[i], tcosX[i]):
 *   re' = im*sin - re*cos
 *   im' = im*cos + re*sin
 * with results in
 *   temp9 /temp10 = re'/im' of z1[1]
 *   temp11/temp12 = re'/im' of z1[0]
 *   temp13/temp14 = re'/im' of z2[0]
 *   temp15/temp16 = re'/im' of z2[1]
 * temp1..temp8 are reused as load scratch for the z2 half. */
__asm__ volatile (
"lwc1 %[temp1], 12(%[z1]) \t\n"
"lwc1 %[temp2], 4(%[tsin1]) \t\n"
"lwc1 %[temp3], 4(%[tcos1]) \t\n"
"lwc1 %[temp4], 8(%[z1]) \t\n"
"lwc1 %[temp5], 4(%[z1]) \t\n"
"mul.s %[temp9], %[temp1], %[temp2] \t\n"
"mul.s %[temp10], %[temp1], %[temp3] \t\n"
"lwc1 %[temp6], 0(%[tsin1]) \t\n"
"lwc1 %[temp7], 0(%[tcos1]) \t\n"
"nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
"madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
"mul.s %[temp11], %[temp5], %[temp6] \t\n"
"mul.s %[temp12], %[temp5], %[temp7] \t\n"
"lwc1 %[temp8], 0(%[z1]) \t\n"
"lwc1 %[temp1], 4(%[z2]) \t\n"
"lwc1 %[temp2], 0(%[tsin2]) \t\n"
"lwc1 %[temp3], 0(%[tcos2]) \t\n"
"nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
"madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
"mul.s %[temp13], %[temp1], %[temp2] \t\n"
"mul.s %[temp14], %[temp1], %[temp3] \t\n"
"lwc1 %[temp4], 0(%[z2]) \t\n"
"lwc1 %[temp5], 12(%[z2]) \t\n"
"lwc1 %[temp6], 4(%[tsin2]) \t\n"
"lwc1 %[temp7], 4(%[tcos2]) \t\n"
"nmsub.s %[temp13], %[temp13], %[temp4], %[temp3] \t\n"
"madd.s %[temp14], %[temp14], %[temp4], %[temp2] \t\n"
"mul.s %[temp15], %[temp5], %[temp6] \t\n"
"mul.s %[temp16], %[temp5], %[temp7] \t\n"
"lwc1 %[temp8], 8(%[z2]) \t\n"
"nmsub.s %[temp15], %[temp15], %[temp8], %[temp7] \t\n"
"madd.s %[temp16], %[temp16], %[temp8], %[temp6] \t\n"
: [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
[temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
[temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
[temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
[temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
[temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
[temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
[temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
: [z1]"r"(z1), [z2]"r"(z2),
[tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
[tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
: "memory"
);
/* Reordering: real parts stay in place, imaginary parts are exchanged
 * between mirrored positions (z1[1] <-> z2[0], z1[0] <-> z2[1]). */
z1[1].re = temp9;
z1[1].im = temp14;
z2[0].re = temp13;
z2[0].im = temp10;
z1[0].re = temp11;
z1[0].im = temp16;
z2[1].re = temp15;
z2[1].im = temp12;
}
}
/**
 * Full inverse MDCT of size N = 1 << s->mdct_bits.
 *
 * The half transform is written into the middle of the output buffer
 * (starting at N/4); the first and last quarters are then derived from it
 * by mirroring: the first quarter is the negated reverse of the second
 * quarter, the last quarter is the reverse of the third quarter.
 *
 * @param s      FFT/MDCT context
 * @param output N samples
 * @param input  N/2 samples
 */
static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    const int len     = 1 << s->mdct_bits;
    const int half    = len >> 1;
    const int quarter = len >> 2;
    int base, u;

    ff_imdct_half_mips(s, output + quarter, input);

    /* Samples are handled in groups of four, same as the unrolled form. */
    for (base = 0; base < quarter; base += 4) {
        /* leading quarter: negated mirror image of the second quarter */
        for (u = 0; u < 4; u++)
            output[base + u] = -output[half - base - 1 - u];

        /* trailing quarter: mirror image of the third quarter */
        for (u = 0; u < 4; u++)
            output[len - base - 1 - u] = output[half + base + u];
    }
}
#endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
#endif /* HAVE_INLINE_ASM */
/**
 * Set up the MIPS implementations in an FFT context.
 *
 * The lookup-table and cosine-table initialisers are always called. The
 * function pointers are only overridden when inline assembly is available
 * and the target is not MIPS32R6/MIPS64R6; the IMDCT pointers additionally
 * require CONFIG_MDCT.
 *
 * @param s context whose fft_calc / imdct_calc / imdct_half are installed
 */
av_cold void ff_fft_init_mips(FFTContext *s)
{
    ff_fft_lut_init();
    ff_init_ff_cos_tabs(17);

#if HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6
    s->fft_calc = ff_fft_calc_mips;
#if CONFIG_MDCT
    s->imdct_calc = ff_imdct_calc_mips;
    s->imdct_half = ff_imdct_half_mips;
#endif /* CONFIG_MDCT */
#endif /* HAVE_INLINE_ASM && !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
}

View File

@ -1,9 +1,6 @@
# subsystems
# Each assignment appends PowerPC object files to the variable named
# OBJS-<value of CONFIG_xxx>, so the list an object lands in is chosen by the
# configuration value of that subsystem.
# NOTE(review): presumably only the list for enabled subsystems is built by
# the including Makefile -- confirm there.
OBJS-$(CONFIG_AUDIODSP) += ppc/audiodsp.o
OBJS-$(CONFIG_BLOCKDSP) += ppc/blockdsp.o
OBJS-$(CONFIG_FFT) += ppc/fft_init.o \
ppc/fft_altivec.o \
ppc/fft_vsx.o
OBJS-$(CONFIG_FDCTDSP) += ppc/fdctdsp.o
OBJS-$(CONFIG_FMTCONVERT) += ppc/fmtconvert_altivec.o
OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o

View File

@ -1,458 +0,0 @@
/*
* FFT transform with Altivec optimizations
* Copyright (c) 2009 Loren Merritt
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* These functions are not individually interchangeable with the C versions.
* While C takes arrays of FFTComplex, Altivec leaves intermediate results
* in blocks as convenient to the vector size.
* i.e. {4x real, 4x imaginary, 4x real, ...}
*
* I ignore standard calling convention.
* Instead, the following registers are treated as global constants:
* v14: zero
* v15..v18: cosines
* v19..v29: permutations
* r9: 16
* r12: ff_cos_tabs
* and the rest are free for local use.
*/
#include "config.h"
#if HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN
#include "asm.S"
.text
// addi2 ra, imm: ra += imm, where imm is an assemble-time constant that may
// not fit the 16-bit field of a single addi.
// The low half (@l) is added with addi and the carry-adjusted high half (@ha)
// with addis; each instruction is emitted only when its half is non-zero.
.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
addis \ra, \ra, \imm@ha
.endif
.endm
// FFT4 a0, a1, a2, a3: 4-point FFT on one vector pair.
// Input is taken from a0/a1 (overwritten as scratch); the result is left in
// a2 = {r0,r1,r2,r3} and a3 = {i0,i1,i2,i3}.
// Uses the permutation constants held in v20..v24.
.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm
// FFT4x2: two independent 4-point FFTs, instruction-for-instruction the same
// sequence as FFT4 with the a* and b* streams interleaved.
// Inputs: a0/a1 and b0/b1 (overwritten). Outputs: a2/a3 and b2/b3.
// Uses the permutation constants held in v20..v24.
.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
vperm \b2,\b0,\b1,v20
vperm \b3,\b0,\b1,v21
vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
vaddfp \b0,\b2,\b3
vsubfp \b1,\b2,\b3
vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
vmrghw \b2,\b0,\b1
vperm \b3,\b0,\b1,v22
vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
vaddfp \b0,\b2,\b3
vsubfp \b1,\b2,\b3
vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
vperm \b2,\b0,\b1,v23
vperm \b3,\b0,\b1,v24
.endm
// FFT8: 8-point FFT. Input and output both live in a0, a1, b0, b1;
// a2, a3, b2, b3 and b4 are scratch.
// The a-stream runs the FFT4 sequence while the b-stream handles elements
// 4..7, including the 1/sqrt(2) twiddle held in v17/v18.
// Uses v14 (zero), v17, v18 and the permutation constants v20..v29.
.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
vperm \a2,\a0,\a1,v20 // FFT4 ...
vperm \a3,\a0,\a1,v21
vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4}
vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7}
vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7}
vaddfp \a0,\a2,\a3
vsubfp \a1,\a2,\a3
vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
vmrghw \a2,\a0,\a1
vperm \a3,\a0,\a1,v22
vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
vaddfp \a0,\a2,\a3
vsubfp \a1,\a2,\a3
vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta}
vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb}
vperm \a2,\a0,\a1,v23
vperm \a3,\a0,\a1,v24
vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7}
vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7}
vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3}
vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3}
.endm
// BF d0, d1, s0, s1: butterfly, d0 = s0 + s1 and d1 = s0 - s1.
// The difference is written first, so d0 may name the same register as s0 or
// s1 (callers in this file do that); d1 must not alias either source.
.macro BF d0,d1,s0,s1
vsubfp \d1,\s0,\s1
vaddfp \d0,\s0,\s1
.endm
// zip d0, d1, s0, s1: interleave the words of s0 and s1.
// d0 receives the merged high words, d1 the merged low words. Used to turn
// separate {re x4} / {im x4} vectors into interleaved re,im pairs.
.macro zip d0,d1,s0,s1
vmrghw \d0,\s0,\s1
vmrglw \d1,\s0,\s1
.endm
// def_fft4 interleave: emits the routine fft4<interleave>_altivec.
// r3 points at the data; two vectors are loaded from r3 and r3+r9 (r9 holds
// 16, see the register conventions at the top of the file), transformed with
// FFT4 and stored back in place.
// With a non-blank argument the result is zipped into interleaved order
// before storing; otherwise the split {re x4}/{im x4} form is stored.
.macro def_fft4 interleave
fft4\interleave\()_altivec:
lvx v0, 0,r3
lvx v1,r9,r3
FFT4 v0,v1,v2,v3
.ifnb \interleave
zip v0,v1,v2,v3
stvx v0, 0,r3
stvx v1,r9,r3
.else
stvx v2, 0,r3
stvx v3,r9,r3
.endif
blr
.endm
// def_fft8 interleave: emits the routine fft8<interleave>_altivec.
// r3 points at the data and r4 is set to r3+32; four vectors are loaded,
// transformed in place with FFT8 (v4..v8 scratch) and stored back.
// With a non-blank argument the real/imaginary vectors are zipped into
// interleaved order before storing.
.macro def_fft8 interleave
fft8\interleave\()_altivec:
addi r4,r3,32
lvx v0, 0,r3
lvx v1,r9,r3
lvx v2, 0,r4
lvx v3,r9,r4
FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
zip v4,v5,v0,v1
zip v6,v7,v2,v3
stvx v4, 0,r3
stvx v5,r9,r3
stvx v6, 0,r4
stvx v7,r9,r4
.else
stvx v0, 0,r3
stvx v1,r9,r3
stvx v2, 0,r4
stvx v3,r9,r4
.endif
blr
.endm
// def_fft16 interleave: emits the routine fft16<interleave>_altivec.
// r3 points at the data; r4, r5, r6 are set to r3+32, r3+64, r3+96.
// The upper half (r5, r6) goes through two 4-point FFTs, the lower half
// (r3, r4) through an 8-point FFT; the upper results are then multiplied by
// the twiddles in v15 (wre) / v16 (wim) and combined with butterflies.
// NOTE(review): the interleave path uses v14 and v15 as zip destinations,
// overwriting the zero/cosine constants listed at the top of the file. That
// looks safe only if this variant is never followed by code relying on those
// constants (DECL_FFT calls only the unsuffixed sub-transforms) -- confirm.
.macro def_fft16 interleave
fft16\interleave\()_altivec:
addi r5,r3,64
addi r6,r3,96
addi r4,r3,32
lvx v0, 0,r5
lvx v1,r9,r5
lvx v2, 0,r6
lvx v3,r9,r6
FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
lvx v0, 0,r3
lvx v1,r9,r3
lvx v2, 0,r4
lvx v3,r9,r4
FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12
vmaddfp v8,v4,v15,v14 // r2*wre
vmaddfp v9,v5,v15,v14 // i2*wre
vmaddfp v10,v6,v15,v14 // r3*wre
vmaddfp v11,v7,v15,v14 // i3*wre
vmaddfp v8,v5,v16,v8 // i2*wim
vnmsubfp v9,v4,v16,v9 // r2*wim
vnmsubfp v10,v7,v16,v10 // i3*wim
vmaddfp v11,v6,v16,v11 // r3*wim
BF v10,v12,v10,v8
BF v11,v13,v9,v11
BF v0,v4,v0,v10
BF v3,v7,v3,v12
BF v1,v5,v1,v11
BF v2,v6,v2,v13
.ifnb \interleave
zip v8, v9,v0,v1
zip v10,v11,v2,v3
zip v12,v13,v4,v5
zip v14,v15,v6,v7
stvx v8, 0,r3
stvx v9,r9,r3
stvx v10, 0,r4
stvx v11,r9,r4
stvx v12, 0,r5
stvx v13,r9,r5
stvx v14, 0,r6
stvx v15,r9,r6
.else
stvx v0, 0,r3
stvx v4, 0,r5
stvx v3,r9,r4
stvx v7,r9,r6
stvx v1,r9,r3
stvx v5,r9,r5
stvx v2, 0,r4
stvx v6, 0,r6
.endif
blr
.endm
// void pass(float *z, float *wre, int n)
// PASS interleave, suffix: emits fft_pass<suffix>_altivec, the combining pass
// run after the sub-transforms of a larger FFT.
// On entry: r3 = z, r4 = wre, r5 = n (loop count, moved to CTR).
// Derived byte offsets: r5 = o1 = n*32, r7 = o2 = n*64, r10 = o3 = o1+o2,
// r6/r8/r11 = the same offsets +16; r0 = wim = wre + n*16, read backwards.
// Each iteration processes four vector pairs (z[0], z[o1], z[o2], z[o3]),
// then advances z by 32 bytes and wre by 16 bytes.
// \interleave must be 0 or 1: 0 stores split re/im vectors, 1 merges them
// into interleaved re,im order before storing.
// The final "sub r3,r3,r5" undoes the n*32 bytes added to r3 by the loop, so
// r3 is unchanged on return.
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
mtctr r5
slwi r0,r5,4
slwi r7,r5,6 // o2
slwi r5,r5,5 // o1
add r10,r5,r7 // o3
add r0,r4,r0 // wim
addi r6,r5,16 // o1+16
addi r8,r7,16 // o2+16
addi r11,r10,16 // o3+16
1:
lvx v8, 0,r4 // wre
lvx v10, 0,r0 // wim
sub r0,r0,r9
lvx v9, 0,r0
vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
lvx v4,r3,r7 // r2 = z[o2]
lvx v5,r3,r8 // i2 = z[o2+16]
lvx v6,r3,r10 // r3 = z[o3]
lvx v7,r3,r11 // i3 = z[o3+16]
vmaddfp v10,v4,v8,v14 // r2*wre
vmaddfp v11,v5,v8,v14 // i2*wre
vmaddfp v12,v6,v8,v14 // r3*wre
vmaddfp v13,v7,v8,v14 // i3*wre
lvx v0, 0,r3 // r0 = z[0]
lvx v3,r3,r6 // i1 = z[o1+16]
vmaddfp v10,v5,v9,v10 // i2*wim
vnmsubfp v11,v4,v9,v11 // r2*wim
vnmsubfp v12,v7,v9,v12 // i3*wim
vmaddfp v13,v6,v9,v13 // r3*wim
lvx v1,r3,r9 // i0 = z[16]
lvx v2,r3,r5 // r1 = z[o1]
BF v12,v8,v12,v10
BF v13,v9,v11,v13
BF v0,v4,v0,v12
BF v3,v7,v3,v8
.if !\interleave
stvx v0, 0,r3
stvx v4,r3,r7
stvx v3,r3,r6
stvx v7,r3,r11
.endif
BF v1,v5,v1,v13
BF v2,v6,v2,v9
.if !\interleave
stvx v1,r3,r9
stvx v2,r3,r5
stvx v5,r3,r8
stvx v6,r3,r10
.else
vmrghw v8,v0,v1
vmrglw v9,v0,v1
stvx v8, 0,r3
stvx v9,r3,r9
vmrghw v8,v2,v3
vmrglw v9,v2,v3
stvx v8,r3,r5
stvx v9,r3,r6
vmrghw v8,v4,v5
vmrglw v9,v4,v5
stvx v8,r3,r7
stvx v9,r3,r8
vmrghw v8,v6,v7
vmrglw v9,v6,v7
stvx v8,r3,r10
stvx v9,r3,r11
.endif
addi r3,r3,32
addi r4,r4,16
bdnz 1b
sub r3,r3,r5
blr
.endm
#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
// Byte indices for building vperm control vectors: WORD_0..WORD_3 select the
// four 32-bit words of the first source operand (bytes 0x00..0x0f),
// WORD_s0..WORD_s3 those of the second source operand (bytes 0x10..0x1f).
#define WORD_0 0x00,0x01,0x02,0x03
#define WORD_1 0x04,0x05,0x06,0x07
#define WORD_2 0x08,0x09,0x0a,0x0b
#define WORD_3 0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f
// vcprm(a,b,c,d): emit one 16-byte permute control vector that picks the
// words a, b, c, d in that order.
#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
.rodata
.align 4
// Constant pool, loaded sequentially into v14..v29 by the fft_calc macro:
// one zero vector (v14), four float vectors (v15..v18) and eleven permute
// control vectors (v19..v29). The order here must match that load order.
fft_data:
.float 0, 0, 0, 0
.float 1, 0.92387953, M_SQRT1_2, 0.38268343
.float 0, 0.38268343, M_SQRT1_2, 0.92387953
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
.float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
vcprm(s0,3,2,1)
vcprm(0,1,s2,s1)
vcprm(2,3,s0,s3)
vcprm(2,s3,3,s2)
vcprm(0,1,s0,s1)
vcprm(2,3,s2,s3)
vcprm(2,3,0,1)
vcprm(1,2,s3,s0)
vcprm(0,3,s2,s1)
vcprm(0,2,s1,s3)
vcprm(1,3,s0,s2)
// lvm b, r [, more...]: load multiple vectors.
// Loads vector register r from the address in b, advances b by 16 bytes and
// recurses over the remaining register list. b is modified.
.macro lvm b, r, regs:vararg
lvx \r, 0, \b
addi \b, \b, 16
.ifnb \regs
lvm \b, \regs
.endif
.endm
// stvm b, r [, more...]: store multiple vectors.
// Stores vector register r at the address in b, advances b by 16 bytes and
// recurses over the remaining register list. b is modified.
.macro stvm b, r, regs:vararg
stvx \r, 0, \b
addi \b, \b, 16
.ifnb \regs
stvm \b, \regs
.endif
.endm
// fft_calc interleave: emits the exported entry point
// ff_fft_calc<interleave>_altivec (r3 = context pointer, r4 = data pointer).
// Sequence:
//   - save LR, allocate a stack frame with room for ten vectors, and save
//     v20..v29 plus VRSAVE in it;
//   - load the constant pool fft_data into v14..v29, set r9 = 16 and
//     r12 = ff_cos_tabs (the register conventions described at file top);
//   - read the first 32-bit word of the context, subtract 2 and use it to
//     index the dispatch table (entries are pointer-sized, hence the shift
//     by 2+ARCH_PPC64), then call the selected routine with r3 = data;
//   - restore v20..v29, VRSAVE, the stack pointer and LR.
// NOTE(review): the word at offset 0 of the context is presumably the
// transform size in bits (table entry 0 is the 4-point FFT) -- confirm
// against the context struct layout.
// stp/stpu/lp/lpx, PS, R(), X(), get_got, movrel and extfunc are not defined
// in this file; presumably they come from asm.S.
.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
mflr r0
stp r0, 2*PS(R(1))
stpu r1, -(160+16*PS)(R(1))
get_got r11
addi r6, r1, 16*PS
stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
mfvrsave r0
stw r0, 15*PS(R(1))
#if __APPLE__
li r6, 0xfffffffc
#else
li r6, -4
#endif
mtvrsave r6
movrel r6, fft_data, r11
lvm r6, v14, v15, v16, v17, v18, v19, v20, v21
lvm r6, v22, v23, v24, v25, v26, v27, v28, v29
li r9, 16
movrel r12, X(ff_cos_tabs), r11
movrel r6, fft_dispatch_tab\interleave\()_altivec, r11
lwz r3, 0(R(3))
subi r3, r3, 2
slwi r3, r3, 2+ARCH_PPC64
lpx r3, r3, r6
mtctr r3
mr r3, r4
bctrl
addi r6, r1, 16*PS
lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lwz r6, 15*PS(R(1))
mtvrsave r6
lp r1, 0(R(1))
lp r0, 2*PS(R(1))
mtlr r0
blr
.endm
// DECL_FFT suffix, bits, n, n2, n4: emits fft<n><suffix>_altivec, an n-point
// FFT built from one n2-point and two n4-point sub-transforms followed by a
// combining pass.
//   - LR is kept in a per-level stack slot at PS*(bits-3) from r1, so nested
//     levels do not overwrite each other;
//   - the sub-transforms called are always the unsuffixed (non-interleaved)
//     variants; only the final pass uses \suffix;
//   - r3 is advanced by n*4 and then n*2 bytes between the calls and moved
//     back by n*6 afterwards;
//   - r4 is loaded from entry \bits of the table in r12 and r5 = n/16 before
//     tail-branching to fft_pass<suffix>_altivec, which returns to the
//     original caller.
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
mflr r0
stp r0,PS*(\bits-3)(R(1))
bl fft\n2\()_altivec
addi2 r3,\n*4
bl fft\n4\()_altivec
addi2 r3,\n*2
bl fft\n4\()_altivec
addi2 r3,\n*-6
lp r0,PS*(\bits-3)(R(1))
lp r4,\bits*PS(R(12))
mtlr r0
li r5,\n/16
b fft_pass\suffix\()_altivec
.endm
// DECL_FFTS interleave, suffix: instantiates one complete family of routines:
// the 4/8/16-point leaves, the combining pass, the composite transforms for
// 32..65536 points and the exported fft_calc entry point, followed by the
// read-only dispatch table fft_dispatch_tab<suffix>_altivec.
// The table has 15 entries ordered by size from 4 to 65536 points; fft_calc
// indexes it, so the order must not change.
// \interleave (0 or 1) is forwarded to PASS; \suffix names the routines and
// also selects the interleaving leaves via their non-blank-argument test.
.macro DECL_FFTS interleave, suffix
.text
def_fft4 \suffix
def_fft8 \suffix
def_fft16 \suffix
PASS \interleave, \suffix
DECL_FFT \suffix, 5, 32, 16, 8
DECL_FFT \suffix, 6, 64, 32, 16
DECL_FFT \suffix, 7, 128, 64, 32
DECL_FFT \suffix, 8, 256, 128, 64
DECL_FFT \suffix, 9, 512, 256, 128
DECL_FFT \suffix,10, 1024, 512, 256
DECL_FFT \suffix,11, 2048, 1024, 512
DECL_FFT \suffix,12, 4096, 2048, 1024
DECL_FFT \suffix,13, 8192, 4096, 2048
DECL_FFT \suffix,14,16384, 8192, 4096
DECL_FFT \suffix,15,32768,16384, 8192
DECL_FFT \suffix,16,65536,32768,16384
fft_calc \suffix
.rodata
.align 3
fft_dispatch_tab\suffix\()_altivec:
PTR fft4\suffix\()_altivec
PTR fft8\suffix\()_altivec
PTR fft16\suffix\()_altivec
PTR fft32\suffix\()_altivec
PTR fft64\suffix\()_altivec
PTR fft128\suffix\()_altivec
PTR fft256\suffix\()_altivec
PTR fft512\suffix\()_altivec
PTR fft1024\suffix\()_altivec
PTR fft2048\suffix\()_altivec
PTR fft4096\suffix\()_altivec
PTR fft8192\suffix\()_altivec
PTR fft16384\suffix\()_altivec
PTR fft32768\suffix\()_altivec
PTR fft65536\suffix\()_altivec
.endm
// Two families are generated. The first (interleave = 0, blank suffix) keeps
// the split re/im vector layout and provides the unsuffixed routines that
// both families call as sub-transforms; the second (interleave = 1, suffix
// _interleave) stores interleaved re,im output.
DECL_FFTS 0
DECL_FFTS 1, _interleave
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && HAVE_BIGENDIAN */

View File

@ -1,168 +0,0 @@
/*
* FFT/IFFT transforms
* AltiVec-enabled
* Copyright (c) 2009 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
/**
* Do a complex FFT with the parameters defined in ff_fft_init().
* The input data must be permuted before with s->revtab table.
* No 1.0 / sqrt(n) normalization is done.
* AltiVec-enabled:
* This code assumes that the 'z' pointer is 16 bytes-aligned.
* It also assumes all FFTComplex are 8 bytes-aligned pairs of floats.
*/
#if HAVE_VSX
#include "fft_vsx.h"
#else
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
#endif
#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX)
/**
 * Half-size inverse MDCT using AltiVec vector operations.
 *
 * Stages: vectorised pre-rotation that scatters results to the positions
 * given by s->revtab, an in-place complex FFT on the output buffer, then a
 * vectorised post-rotation with reordering.
 *
 * All vector pointers are centred on the middle of their arrays (offset n/4
 * for input/output, n/8 for the twiddle tables) and indexed with both
 * positive and negative indices, so each loop works on mirrored positions.
 * Both loops are do/while running k from n32-1 down to 0, i.e. they execute
 * at least once; n32 = n >> 5 must therefore be at least 1.
 *
 * @param s      context; reads mdct_bits, revtab, tcos and tsin
 * @param output output buffer, also used as the FFT work buffer
 * @param input  input coefficients
 */
static void imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
{
int j, k;
int n = 1 << s->mdct_bits;
int n4 = n >> 2;
int n8 = n >> 3;
int n32 = n >> 5;
/* revtabj walks forward from the start of the table, revtabk backward from
 * index n4 */
const uint16_t *revtabj = s->revtab;
const uint16_t *revtabk = s->revtab+n4;
const vec_f *tcos = (const vec_f*)(s->tcos+n8);
const vec_f *tsin = (const vec_f*)(s->tsin+n8);
const vec_f *pin = (const vec_f*)(input+n4);
vec_f *pout = (vec_f*)(output+n4);
/* pre rotation */
k = n32-1;
do {
vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
/* CMULA(p, o0..o3): gathers four (re, im) pairs from the mirrored input
 * vectors pin[k*2+p] and pin[-k*2-p-1], selects the matching twiddles from
 * cos0/cos1 and sin0/sin1, and leaves the rotated values in r<p> / i<p>. */
#define CMULA(p,o0,o1,o2,o3)\
a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\
b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\
im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\
cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
r##p = im*cos - re*sin;\
i##p = re*cos + im*sin;
/* STORE2(v, dst): writes one complex value (two consecutive floats) to
 * output[dst*2] with two single-element stores. Side effect: assigns j. */
#define STORE2(v,dst)\
j = dst;\
vec_ste(v, 0, output+j*2);\
vec_ste(v, 4, output+j*2);
/* STORE8(p): scatters the four complex results r<p>/i<p> to the positions
 * read from revtabk / revtabj. Each pair is first replicated into both
 * halves of a vector, presumably because vec_ste chooses the stored lane
 * from the destination address -- TODO confirm. */
#define STORE8(p)\
a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
STORE2(a, revtabk[ p*2-4]);\
STORE2(b, revtabk[ p*2-3]);\
STORE2(c, revtabj[-p*2+2]);\
STORE2(d, revtabj[-p*2+3]);
cos0 = tcos[k];
sin0 = tsin[k];
cos1 = tcos[-k-1];
sin1 = tsin[-k-1];
CMULA(0, 0,1,2,3);
CMULA(1, 2,3,0,1);
STORE8(0);
STORE8(1);
/* eight complex values handled per iteration: four table entries consumed
 * from each end */
revtabj += 4;
revtabk -= 4;
k--;
} while(k >= 0);
#if HAVE_VSX
ff_fft_calc_vsx(s, (FFTComplex*)output);
#else
ff_fft_calc_altivec(s, (FFTComplex*)output);
#endif
/* post rotation + reordering */
/* j counts up from -n32 while k counts down from n32-1, so every iteration
 * handles one block below and one block above the centre of pout. */
j = -n32;
k = n32-1;
do {
vec_f cos,sin,re,im,a,b,c,d;
/* CMULB(d0, d1, o): rotates the vector pair pout[o*2] / pout[o*2+1] by the
 * twiddles tcos[o] / tsin[o]; d0 = im*sin - re*cos, d1 = re*sin + im*cos. */
#define CMULB(d0,d1,o)\
re = pout[o*2];\
im = pout[o*2+1];\
cos = tcos[o];\
sin = tsin[o];\
d0 = im*sin - re*cos;\
d1 = re*sin + im*cos;
CMULB(a,b,j);
CMULB(c,d,k);
/* block j is rebuilt from a and d (d taken in reverse word order), block k
 * from c and b in the same way */
pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2));
pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2));
pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
j++;
k--;
} while(k >= 0);
}
/**
 * Full inverse MDCT built on imdct_half_altivec().
 *
 * The half transform fills the middle half of the output (starting at
 * n/4). Working one 4-float vector at a time, the first quarter is then
 * produced as the sign-flipped, word-reversed mirror of the second quarter,
 * and the last quarter as the word-reversed mirror of the third quarter.
 *
 * @param s      context; reads mdct_bits
 * @param output n samples, n = 1 << s->mdct_bits
 * @param input  input coefficients, passed through to the half transform
 */
static void imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    const int len      = 1 << s->mdct_bits;
    const int quarter  = len >> 2;
    const int nvectors = len >> 4;
    /* XOR with the IEEE sign bit negates each float lane */
    const vec_u32 signbit = {1U<<31,1U<<31,1U<<31,1U<<31};
    vec_u32 *lo = (vec_u32*)(output + quarter);
    vec_u32 *hi = (vec_u32*)(output + quarter*3);
    int i = 0;

    imdct_half_altivec(s, output + quarter, input);

    while (i < nvectors) {
        vec_u32 flipped = lo[i] ^ signbit;
        vec_u32 tail    = hi[-i-1];

        lo[-i-1] = vec_perm(flipped, flipped, vcprm(3,2,1,0));
        hi[i]    = vec_perm(tail, tail, vcprm(3,2,1,0));
        i++;
    }
}
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) */
/**
 * Install the PowerPC FFT/IMDCT implementations in an FFT context.
 *
 * Does nothing unless the build has GNU as and AltiVec on a big-endian or
 * VSX target, and the running CPU reports AltiVec support.
 *
 * @param s context whose fft_calc (and, for large enough transforms,
 *          imdct_calc / imdct_half) pointers are replaced
 */
av_cold void ff_fft_init_ppc(FFTContext *s)
{
#if HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX)
if (!PPC_ALTIVEC(av_get_cpu_flags()))
return;
#if HAVE_VSX
s->fft_calc = ff_fft_calc_interleave_vsx;
#else
s->fft_calc = ff_fft_calc_interleave_altivec;
#endif
/* the vector IMDCT loops run (1 << mdct_bits) >> 5 iterations and always
 * execute at least once, so they need mdct_bits >= 5 */
if (s->mdct_bits >= 5) {
s->imdct_calc = imdct_calc_altivec;
s->imdct_half = imdct_half_altivec;
}
#endif /* HAVE_GNU_AS && HAVE_ALTIVEC && (HAVE_BIGENDIAN || HAVE_VSX) */
}

View File

@ -1,226 +0,0 @@
/*
* FFT transform, optimized with VSX built-in functions
* Copyright (c) 2014 Rong Yan
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"
#include "fft_vsx.h"
#if HAVE_VSX
/*
 * Composite FFTs with interleaved output for 32 .. 65536 points.
 *
 * An n-point transform runs one n/2-point transform on the first half of
 * the buffer, two n/4-point transforms on the third and fourth quarters
 * (offsets 2*n4 and 3*n4 complex elements), and then the combining pass with
 * the cosine table for n and a count of n4/2.
 */
#define DECL_FFT_VSX_INTERLEAVE(n, n2, n4)                  \
static void fft##n##_vsx_interleave(FFTComplex *z)          \
{                                                           \
    fft##n2##_vsx_interleave(z);                            \
    fft##n4##_vsx_interleave(z + n4 * 2);                   \
    fft##n4##_vsx_interleave(z + n4 * 3);                   \
    pass_vsx_interleave(z, ff_cos_##n, n4 / 2);             \
}

DECL_FFT_VSX_INTERLEAVE(   32,    16,     8)
DECL_FFT_VSX_INTERLEAVE(   64,    32,    16)
DECL_FFT_VSX_INTERLEAVE(  128,    64,    32)
DECL_FFT_VSX_INTERLEAVE(  256,   128,    64)
DECL_FFT_VSX_INTERLEAVE(  512,   256,   128)
DECL_FFT_VSX_INTERLEAVE( 1024,   512,   256)
DECL_FFT_VSX_INTERLEAVE( 2048,  1024,   512)
DECL_FFT_VSX_INTERLEAVE( 4096,  2048,  1024)
DECL_FFT_VSX_INTERLEAVE( 8192,  4096,  2048)
DECL_FFT_VSX_INTERLEAVE(16384,  8192,  4096)
DECL_FFT_VSX_INTERLEAVE(32768, 16384,  8192)
DECL_FFT_VSX_INTERLEAVE(65536, 32768, 16384)

#undef DECL_FFT_VSX_INTERLEAVE
/*
 * Composite FFTs (non-interleaved variant) for 32 .. 65536 points.
 *
 * An n-point transform runs one n/2-point transform on the first half of
 * the buffer, two n/4-point transforms on the third and fourth quarters
 * (offsets 2*n4 and 3*n4 complex elements), and then the combining pass with
 * the cosine table for n and a count of n4/2.
 */
#define DECL_FFT_VSX(n, n2, n4)                 \
static void fft##n##_vsx(FFTComplex *z)         \
{                                               \
    fft##n2##_vsx(z);                           \
    fft##n4##_vsx(z + n4 * 2);                  \
    fft##n4##_vsx(z + n4 * 3);                  \
    pass_vsx(z, ff_cos_##n, n4 / 2);            \
}

DECL_FFT_VSX(   32,    16,     8)
DECL_FFT_VSX(   64,    32,    16)
DECL_FFT_VSX(  128,    64,    32)
DECL_FFT_VSX(  256,   128,    64)
DECL_FFT_VSX(  512,   256,   128)
DECL_FFT_VSX( 1024,   512,   256)
DECL_FFT_VSX( 2048,  1024,   512)
DECL_FFT_VSX( 4096,  2048,  1024)
DECL_FFT_VSX( 8192,  4096,  2048)
DECL_FFT_VSX(16384,  8192,  4096)
DECL_FFT_VSX(32768, 16384,  8192)
DECL_FFT_VSX(65536, 32768, 16384)

#undef DECL_FFT_VSX
/*
 * Transform lookup tables, indexed by (number of bits - 2): entry 0 is the
 * 4-point transform, entry 14 the 65536-point one.
 */
static void (* const fft_dispatch_vsx[])(FFTComplex*) = {
    fft4_vsx,      /*  0 */
    fft8_vsx,      /*  1 */
    fft16_vsx,     /*  2 */
    fft32_vsx,     /*  3 */
    fft64_vsx,     /*  4 */
    fft128_vsx,    /*  5 */
    fft256_vsx,    /*  6 */
    fft512_vsx,    /*  7 */
    fft1024_vsx,   /*  8 */
    fft2048_vsx,   /*  9 */
    fft4096_vsx,   /* 10 */
    fft8192_vsx,   /* 11 */
    fft16384_vsx,  /* 12 */
    fft32768_vsx,  /* 13 */
    fft65536_vsx,  /* 14 */
};
static void (* const fft_dispatch_vsx_interleave[])(FFTComplex*) = {
    fft4_vsx_interleave,      /*  0 */
    fft8_vsx_interleave,      /*  1 */
    fft16_vsx_interleave,     /*  2 */
    fft32_vsx_interleave,     /*  3 */
    fft64_vsx_interleave,     /*  4 */
    fft128_vsx_interleave,    /*  5 */
    fft256_vsx_interleave,    /*  6 */
    fft512_vsx_interleave,    /*  7 */
    fft1024_vsx_interleave,   /*  8 */
    fft2048_vsx_interleave,   /*  9 */
    fft4096_vsx_interleave,   /* 10 */
    fft8192_vsx_interleave,   /* 11 */
    fft16384_vsx_interleave,  /* 12 */
    fft32768_vsx_interleave,  /* 13 */
    fft65536_vsx_interleave,  /* 14 */
};
/**
 * Run the interleaved-output VSX FFT selected by the context size.
 *
 * @param s context; s->nbits - 2 is used as the dispatch table index
 * @param z data, transformed in place
 */
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z)
{
    void (* const transform)(FFTComplex*) = fft_dispatch_vsx_interleave[s->nbits-2];

    transform(z);
}
/**
 * Run the non-interleaved VSX FFT selected by the context size.
 *
 * @param s context; s->nbits - 2 is used as the dispatch table index
 * @param z data, transformed in place
 */
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z)
{
    void (* const transform)(FFTComplex*) = fft_dispatch_vsx[s->nbits-2];

    transform(z);
}
#endif /* HAVE_VSX */

View File

@ -1,829 +0,0 @@
#ifndef AVCODEC_PPC_FFT_VSX_H
#define AVCODEC_PPC_FFT_VSX_H
/*
* FFT transform, optimized with VSX built-in functions
* Copyright (c) 2014 Rong Yan Copyright (c) 2009 Loren Merritt
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"
#if HAVE_VSX
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
/* Size in bytes of N consecutive FFTComplex elements, for N = 2, 4, ... 14. */
#define byte_2complex (2*sizeof(FFTComplex))
#define byte_4complex (4*sizeof(FFTComplex))
#define byte_6complex (6*sizeof(FFTComplex))
#define byte_8complex (8*sizeof(FFTComplex))
#define byte_10complex (10*sizeof(FFTComplex))
#define byte_12complex (12*sizeof(FFTComplex))
#define byte_14complex (14*sizeof(FFTComplex))
/*
 * One combining pass over z for the interleaved (re,im,re,im,...) layout.
 *
 * z is accessed as a flat FFTSample array through `out`. Four regions are
 * processed in lock-step, starting at element 0, o1 = 2n, o2 = 4n and
 * o3 = 6n of z (byte offsets 0, i1, i2, i3). Twiddles are read forwards from
 * wre and backwards from wim = wre + 2n.
 *
 * Each step handles two 16-byte vectors per region and advances `out` by
 * 8 samples. NOTE(review): the +16 offsets assume sizeof(FFTComplex) == 8
 * (two 4-byte samples) -- confirm against the FFTComplex definition.
 *
 * The first step is peeled out of the loop: it never loads wre[0]/wim[0] and
 * feeds the first pair straight from x0/x1 (presumably because that twiddle
 * is 1+0i -- TODO confirm).
 *
 * n must be even and greater than 2: the counter is unsigned, so with n == 2
 * the `n-=2` in the do/while condition would wrap instead of reaching zero.
 */
inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
int o1 = n<<1;
int o2 = n<<2;
int o3 = o1+o2;
int i1, i2, i3;
FFTSample* out = (FFTSample*)z;
const FFTSample *wim = wre+o1;
vec_f vz0, vzo1, vzo2, vzo3;
vec_f x0, x1, x2, x3;
vec_f x4, x5, x6, x7;
vec_f x8, x9, x10, x11;
vec_f x12, x13, x14, x15;
vec_f x16, x17, x18, x19;
vec_f x20, x21, x22, x23;
vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
vec_f y0, y1, y2, y3;
vec_f y4, y5, y8, y9;
vec_f y10, y13, y14, y15;
vec_f y16, y17, y18, y19;
vec_f y20, y21, y22, y23;
vec_f wr1, wi1, wr0, wi0;
vec_f wr2, wi2, wr3, wi3;
vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
// The peeled first step below accounts for two of the n counts.
n = n-2;
// Convert the element offsets of the three upper regions to byte offsets.
i1 = o1*sizeof(FFTComplex);
i2 = o2*sizeof(FFTComplex);
i3 = o3*sizeof(FFTComplex);
// ---- peeled first step ----
vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
vzo2plus1 = vec_ld(i2+16, &(out[0]));
vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
vzo3plus1 = vec_ld(i3+16, &(out[0]));
vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
vz0plus1 = vec_ld(16, &(out[0]));
vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
vzo1plus1 = vec_ld(i1+16, &(out[0]));
// Sum and difference of the o2 and o3 regions.
x0 = vec_add(vzo2, vzo3);
x1 = vec_sub(vzo2, vzo3);
y0 = vec_add(vzo2plus1, vzo3plus1);
y1 = vec_sub(vzo2plus1, vzo3plus1);
// Broadcast the twiddles for elements 1..3 (element 0 uses none here).
wr1 = vec_splats(wre[1]);
wi1 = vec_splats(wim[-1]);
wi2 = vec_splats(wim[-2]);
wi3 = vec_splats(wim[-3]);
wr2 = vec_splats(wre[2]);
wr3 = vec_splats(wre[3]);
x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
// Twiddle multiplication: products with wi first, then fused with wr.
ymulwi2 = vec_mul(y4, wi2);
ymulwi3 = vec_mul(y5, wi3);
x4 = vec_mul(x2, wr1);
x5 = vec_mul(x3, wi1);
y8 = vec_madd(y2, wr2, ymulwi2);
y9 = vec_msub(y2, wr2, ymulwi2);
x6 = vec_add(x4, x5);
x7 = vec_sub(x4, x5);
y13 = vec_madd(y3, wr3, ymulwi3);
y14 = vec_msub(y3, wr3, ymulwi3);
x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));
y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
// Combine with the region-0 and region-o1 inputs.
x11 = vec_add(vz0, x9);
x12 = vec_sub(vz0, x9);
x13 = vec_add(vzo1, x10);
x14 = vec_sub(vzo1, x10);
y18 = vec_add(vz0plus1, y16);
y19 = vec_sub(vz0plus1, y16);
y20 = vec_add(vzo1plus1, y17);
y21 = vec_sub(vzo1plus1, y17);
x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
// Write the results back in place to all four regions.
vec_st(x11, 0, &(out[0]));
vec_st(y18, 16, &(out[0]));
vec_st(x15, i1, &(out[0]));
vec_st(y22, i1+16, &(out[0]));
vec_st(x12, i2, &(out[0]));
vec_st(y19, i2+16, &(out[0]));
vec_st(x16, i3, &(out[0]));
vec_st(y23, i3+16, &(out[0]));
// ---- general steps: same computation, but all four twiddles are used ----
do {
out += 8;
wre += 4;
wim -= 4;
wr0 = vec_splats(wre[0]);
wr1 = vec_splats(wre[1]);
wi0 = vec_splats(wim[0]);
wi1 = vec_splats(wim[-1]);
wr2 = vec_splats(wre[2]);
wr3 = vec_splats(wre[3]);
wi2 = vec_splats(wim[-2]);
wi3 = vec_splats(wim[-3]);
vzo2 = vec_ld(i2, &(out[0])); // zo2.r zo2.i z(o2+1).r z(o2+1).i
vzo2plus1 = vec_ld(i2+16, &(out[0]));
vzo3 = vec_ld(i3, &(out[0])); // zo3.r zo3.i z(o3+1).r z(o3+1).i
vzo3plus1 = vec_ld(i3+16, &(out[0]));
vz0 = vec_ld(0, &(out[0])); // z0.r z0.i z1.r z1.i
vz0plus1 = vec_ld(16, &(out[0]));
vzo1 = vec_ld(i1, &(out[0])); // zo1.r zo1.i z(o1+1).r z(o1+1).i
vzo1plus1 = vec_ld(i1+16, &(out[0]));
x0 = vec_add(vzo2, vzo3);
x1 = vec_sub(vzo2, vzo3);
y0 = vec_add(vzo2plus1, vzo3plus1);
y1 = vec_sub(vzo2plus1, vzo3plus1);
x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
xmulwi0 = vec_mul(x4, wi0);
xmulwi1 = vec_mul(x5, wi1);
y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
x8 = vec_madd(x2, wr0, xmulwi0);
x9 = vec_msub(x2, wr0, xmulwi0);
ymulwi2 = vec_mul(y4, wi2);
ymulwi3 = vec_mul(y5, wi3);
x13 = vec_madd(x3, wr1, xmulwi1);
x14 = vec_msub(x3, wr1, xmulwi1);
y8 = vec_madd(y2, wr2, ymulwi2);
y9 = vec_msub(y2, wr2, ymulwi2);
y13 = vec_madd(y3, wr3, ymulwi3);
y14 = vec_msub(y3, wr3, ymulwi3);
x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));
y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));
y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
x18 = vec_add(vz0, x16);
x19 = vec_sub(vz0, x16);
x20 = vec_add(vzo1, x17);
x21 = vec_sub(vzo1, x17);
y18 = vec_add(vz0plus1, y16);
y19 = vec_sub(vz0plus1, y16);
y20 = vec_add(vzo1plus1, y17);
y21 = vec_sub(vzo1plus1, y17);
x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));
y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
vec_st(x18, 0, &(out[0]));
vec_st(y18, 16, &(out[0]));
vec_st(x22, i1, &(out[0]));
vec_st(y22, i1+16, &(out[0]));
vec_st(x19, i2, &(out[0]));
vec_st(y19, i2+16, &(out[0]));
vec_st(x23, i3, &(out[0]));
vec_st(y23, i3+16, &(out[0]));
// Each step consumes two counts; the loop ends when the counter hits zero.
} while (n-=2);
}
/* Scalar 2-point butterfly, in place: z[0] becomes z[0] + z[1] and z[1]
 * becomes the original z[0] - z[1], for both the real and imaginary parts. */
inline static void fft2_vsx_interleave(FFTComplex *z)
{
    /* Take both differences before z[0] is overwritten with the sums. */
    const FFTSample diff_re = z[0].re - z[1].re;
    const FFTSample diff_im = z[0].im - z[1].im;

    z[0].re += z[1].re;
    z[0].im += z[1].im;
    z[1].re  = diff_re;
    z[1].im  = diff_im;
}
/* In-place 4-point transform on interleaved re/im data: two add/sub stages,
 * each preceded by a pair of vec_perm shuffles that line up the operands. */
inline static void fft4_vsx_interleave(FFTComplex *z)
{
    float *out = (float *)z;
    vec_f in_lo, in_hi;     /* z[0..1] and z[2..3] as loaded          */
    vec_f lhs, rhs;         /* shuffled operands of the current stage */
    vec_f sum, dif;         /* first-stage results                    */

    in_lo = vec_ld(0, &(out[0]));
    in_hi = vec_ld(byte_2complex, &(out[0]));

    /* Stage 1 */
    lhs = vec_perm(in_lo, in_hi, vcprm(0,1,s2,s1));
    rhs = vec_perm(in_lo, in_hi, vcprm(2,3,s0,s3));
    sum = vec_add(lhs, rhs);
    dif = vec_sub(lhs, rhs);

    /* Stage 2; the results are stored straight back over the input. */
    lhs = vec_perm(sum, dif, vcprm(0,1,s0,s1));
    rhs = vec_perm(sum, dif, vcprm(2,3,s3,s2));
    vec_st(vec_add(lhs, rhs), 0, &(out[0]));
    vec_st(vec_sub(lhs, rhs), byte_2complex, &(out[0]));
}
/*
 * In-place 8-point transform on interleaved re/im data.
 *
 * Loads z[0..7] as four vectors, runs add/sub stages separated by vec_perm
 * shuffles, scales one intermediate by sqrthalf (vc1), and stores the four
 * result vectors back over the input. The lane comments on the later
 * statements give the element held in each lane at that point.
 */
inline static void fft8_vsx_interleave(FFTComplex *z)
{
vec_f vz0, vz1, vz2, vz3;
vec_f x0, x1, x2, x3;
vec_f x4, x5, x6, x7;
vec_f x8, x9, x10, x11;
vec_f x12, x13, x14, x15;
vec_f x16, x17, x18, x19;
vec_f x20, x21, x22, x23;
vec_f x24, x25, x26, x27;
vec_f x28, x29, x30, x31;
vec_f x32, x33, x34;
float* out= (float*)z;
vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
// Load all eight complex inputs.
vz0 = vec_ld(0, &(out[0]));
vz1 = vec_ld(byte_2complex, &(out[0]));
vz2 = vec_ld(byte_4complex, &(out[0]));
vz3 = vec_ld(byte_6complex, &(out[0]));
// First add/sub stage on shuffled inputs.
x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));
x4 = vec_add(x0, x1);
x5 = vec_sub(x0, x1);
x6 = vec_add(x2, x3);
x7 = vec_sub(x2, x3);
// Second add/sub stage.
x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));
x12 = vec_add(x8, x9);
x13 = vec_sub(x8, x9);
x14 = vec_add(x10, x11);
x15 = vec_sub(x10, x11);
// Final combination producing the outputs commented below.
x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i
x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i
x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
x24 = vec_add(x22, x23);
x25 = vec_sub(x22, x23);
// This term is scaled by sqrthalf in every lane.
x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);
x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i
x28 = vec_sub(x21, x26); //z5.r z3.r z5.i z7.i
// Re-interleave into re,im pairs in element order before storing.
x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r z0.i z1.r z1.i
x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r z2.i z7.r z3.i
x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r z4.i z5.r z5.i
x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r z6.i z3.r z7.i
x33 = vec_perm(x30, x32, vcprm(0,1,s2,3)); // z2.r z2.i z3.r z3.i
x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i
vec_st(x29, 0, &(out[0]));
vec_st(x33, byte_2complex, &(out[0]));
vec_st(x31, byte_4complex, &(out[0]));
vec_st(x34, byte_6complex, &(out[0]));
}
/*
 * In-place 16-point transform on interleaved re/im data.
 *
 * Loads z[0..15] as eight vectors and works through a fixed network of
 * vec_perm shuffles and add/sub stages. Three constants are applied along
 * the way: sqrthalf (vc0), ff_cos_16[1] (vc1) and ff_cos_16[3] (vc2).
 * Results are written back over the input; the lane comments on the final
 * statements give the output element held in each lane.
 */
inline static void fft16_vsx_interleave(FFTComplex *z)
{
float* out= (float*)z;
vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
vec_f vz0, vz1, vz2, vz3;
vec_f vz4, vz5, vz6, vz7;
vec_f x0, x1, x2, x3;
vec_f x4, x5, x6, x7;
vec_f x8, x9, x10, x11;
vec_f x12, x13, x14, x15;
vec_f x16, x17, x18, x19;
vec_f x20, x21, x22, x23;
vec_f x24, x25, x26, x27;
vec_f x28, x29, x30, x31;
vec_f x32, x33, x34, x35;
vec_f x36, x37, x38, x39;
vec_f x40, x41, x42, x43;
vec_f x44, x45, x46, x47;
vec_f x48, x49, x50, x51;
vec_f x52, x53, x54, x55;
vec_f x56, x57, x58, x59;
vec_f x60, x61, x62, x63;
vec_f x64, x65, x66, x67;
vec_f x68, x69, x70, x71;
vec_f x72, x73, x74, x75;
vec_f x76, x77, x78, x79;
vec_f x80, x81, x82, x83;
vec_f x84, x85, x86;
// Load all sixteen complex inputs.
vz0 = vec_ld(0, &(out[0]));
vz1 = vec_ld(byte_2complex, &(out[0]));
vz2 = vec_ld(byte_4complex, &(out[0]));
vz3 = vec_ld(byte_6complex, &(out[0]));
vz4 = vec_ld(byte_8complex, &(out[0]));
vz5 = vec_ld(byte_10complex, &(out[0]));
vz6 = vec_ld(byte_12complex, &(out[0]));
vz7 = vec_ld(byte_14complex, &(out[0]));
// First add/sub stage on shuffled inputs.
x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));
x8 = vec_add(x0, x1);
x9 = vec_sub(x0, x1);
x10 = vec_add(x2, x3);
x11 = vec_sub(x2, x3);
x12 = vec_add(x4, x5);
x13 = vec_sub(x4, x5);
x14 = vec_add(x6, x7);
x15 = vec_sub(x6, x7);
// Second add/sub stage.
x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));
x24 = vec_add(x16, x17);
x25 = vec_sub(x16, x17);
x26 = vec_add(x18, x19);
x27 = vec_sub(x18, x19);
x28 = vec_add(x20, x21);
x29 = vec_sub(x20, x21);
x30 = vec_add(x22, x23);
x31 = vec_sub(x22, x23);
// Remaining stages; sqrthalf and the ff_cos_16 constants are applied here.
x32 = vec_add(x24, x26);
x33 = vec_sub(x24, x26);
x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));
x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
x37 = vec_add(x35, x36);
x38 = vec_sub(x35, x36);
x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));
x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
x42 = vec_add(x40, x41);
x43 = vec_sub(x40, x41);
x44 = vec_mul(x42, vc0);
x45 = vec_mul(x43, vc0);
x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i
x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i
x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
x50 = vec_add(x48, x49);
x51 = vec_sub(x48, x49);
x52 = vec_mul(x50, vc1);
x53 = vec_mul(x50, vc2);
x54 = vec_mul(x51, vc1);
x55 = vec_mul(x51, vc2);
x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
x58 = vec_add(x56, x57);
x59 = vec_sub(x56, x57);
x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
x62 = vec_add(x52, x61);
x63 = vec_sub(x52, x61);
x64 = vec_add(x60, x53);
x65 = vec_sub(x60, x53);
x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));
x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i
x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i
x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i
x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i
x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
x73 = vec_add(x25, x72);
x74 = vec_sub(x25, x72);
x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i
x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i
// Gather outputs 0..7 into element order and store them.
x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i
x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i
x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i
x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i
vec_st(x79, 0, &(out[0]));
vec_st(x80, byte_2complex, &(out[0]));
vec_st(x81, byte_4complex, &(out[0]));
vec_st(x82, byte_6complex, &(out[0]));
// Gather outputs 8..15 into element order and store them.
x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i
x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i
x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i
x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i
vec_st(x83, byte_8complex, &(out[0]));
vec_st(x84, byte_10complex, &(out[0]));
vec_st(x85, byte_12complex, &(out[0]));
vec_st(x86, byte_14complex, &(out[0]));
}
/* In-place 4-point transform: two add/sub stages, then one more shuffle pair
 * that arranges the results into the layout stored back to z. */
inline static void fft4_vsx(FFTComplex *z)
{
    float *out = (float *)z;
    vec_f in_lo, in_hi;     /* z[0..1] and z[2..3] as loaded          */
    vec_f lhs, rhs;         /* shuffled operands of the current stage */
    vec_f sum, dif;         /* results of the current stage           */

    in_lo = vec_ld(0, &(out[0]));
    in_hi = vec_ld(byte_2complex, &(out[0]));

    /* Stage 1 */
    lhs = vec_perm(in_lo, in_hi, vcprm(0,1,s2,s1));
    rhs = vec_perm(in_lo, in_hi, vcprm(2,3,s0,s3));
    sum = vec_add(lhs, rhs);
    dif = vec_sub(lhs, rhs);

    /* Stage 2 */
    lhs = vec_perm(sum, dif, vcprm(0,s0,1,s1));
    rhs = vec_perm(sum, dif, vcprm(2,s3,3,s2));
    sum = vec_add(lhs, rhs);
    dif = vec_sub(lhs, rhs);

    /* Output shuffle; no arithmetic happens after this point. */
    vec_st(vec_perm(sum, dif, vcprm(0,1,s0,s1)), 0, &(out[0]));
    vec_st(vec_perm(sum, dif, vcprm(2,3,s2,s3)), byte_2complex, &(out[0]));
}
/*
 * In-place 8-point transform for the non-interleaved kernel family.
 *
 * Loads z[0..7] as four vectors, runs shuffle + add/sub stages, applies a
 * sqrthalf rotation to one intermediate via two vec_madd calls (vc1 carries
 * the signed factors, vc2 the unsigned ones, vc0 is the zero addend), and
 * stores four result vectors back over the input.
 */
inline static void fft8_vsx(FFTComplex *z)
{
vec_f vz0, vz1, vz2, vz3;
vec_f vz4, vz5, vz6, vz7, vz8;
float* out= (float*)z;
vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
vz0 = vec_ld(0, &(out[0]));
vz1 = vec_ld(byte_2complex, &(out[0]));
vz2 = vec_ld(byte_4complex, &(out[0]));
vz3 = vec_ld(byte_6complex, &(out[0]));
// First add/sub stage on shuffled inputs.
vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
vz2 = vec_add(vz6, vz7);
vz3 = vec_sub(vz6, vz7);
// vz8 is vz3 with its two halves swapped; it feeds the second madd below.
vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
vz0 = vec_add(vz4, vz5);
vz1 = vec_sub(vz4, vz5);
// vz3 = vz3*vc1 + vz8*vc2, built as two chained multiply-adds.
vz3 = vec_madd(vz3, vc1, vc0);
vz3 = vec_madd(vz8, vc2, vz3);
// Second add/sub stage.
vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
vz0 = vec_add(vz4, vz5);
vz1 = vec_sub(vz4, vz5);
vz2 = vec_add(vz6, vz7);
vz3 = vec_sub(vz6, vz7);
// Final stage: combine the two halves and store.
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
vz2 = vec_sub(vz4, vz6);
vz3 = vec_sub(vz5, vz7);
vz0 = vec_add(vz4, vz6);
vz1 = vec_add(vz5, vz7);
vec_st(vz0, 0, &(out[0]));
vec_st(vz1, byte_2complex, &(out[0]));
vec_st(vz2, byte_4complex, &(out[0]));
vec_st(vz3, byte_6complex, &(out[0]));
return;
}
/*
 * In-place 16-point transform for the non-interleaved kernel family.
 *
 * Structure visible in the code:
 *   1. z[8..15] go through two shuffle + add/sub stages and a final shuffle
 *      (the same shuffle patterns as fft4_vsx, applied to each half);
 *   2. z[0..7] go through the same sequence as fft8_vsx;
 *   3. the upper-half results are multiplied by the constants in vc3/vc4/vc5
 *      and combined with the lower half by add/sub before being stored.
 * vc3..vc5 hold literal values 1, 0.92387953, sqrthalf, 0.38268343 and their
 * negations (presumably cos/sin of k*pi/8 -- TODO confirm).
 */
inline static void fft16_vsx(FFTComplex *z)
{
float* out= (float*)z;
vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};
vec_f vz0, vz1, vz2, vz3;
vec_f vz4, vz5, vz6, vz7;
vec_f vz8, vz9, vz10, vz11;
vec_f vz12, vz13;
// ---- step 1: upper half, z[8..15]; results end up in vz4..vz7 ----
vz0 = vec_ld(byte_8complex, &(out[0]));
vz1 = vec_ld(byte_10complex, &(out[0]));
vz2 = vec_ld(byte_12complex, &(out[0]));
vz3 = vec_ld(byte_14complex, &(out[0]));
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));
vz0 = vec_add(vz4, vz5);
vz1= vec_sub(vz4, vz5);
vz2 = vec_add(vz6, vz7);
vz3 = vec_sub(vz6, vz7);
vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));
vz0 = vec_add(vz4, vz5);
vz1 = vec_sub(vz4, vz5);
vz2 = vec_add(vz6, vz7);
vz3 = vec_sub(vz6, vz7);
vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
// ---- step 2: lower half, z[0..7]; results end up in vz0..vz3 ----
vz0 = vec_ld(0, &(out[0]));
vz1 = vec_ld(byte_2complex, &(out[0]));
vz2 = vec_ld(byte_4complex, &(out[0]));
vz3 = vec_ld(byte_6complex, &(out[0]));
vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
vz2 = vec_add(vz10, vz11);
vz3 = vec_sub(vz10, vz11);
vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
vz0 = vec_add(vz8, vz9);
vz1 = vec_sub(vz8, vz9);
vz3 = vec_madd(vz3, vc1, vc0);
vz3 = vec_madd(vz12, vc2, vz3);
vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
vz0 = vec_add(vz8, vz9);
vz1 = vec_sub(vz8, vz9);
vz2 = vec_add(vz10, vz11);
vz3 = vec_sub(vz10, vz11);
vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
vz2 = vec_sub(vz8, vz10);
vz3 = vec_sub(vz9, vz11);
vz0 = vec_add(vz8, vz10);
vz1 = vec_add(vz9, vz11);
// ---- step 3: multiply the upper-half results by vc3/vc4/vc5 ----
vz8 = vec_madd(vz4, vc3, vc0);
vz9 = vec_madd(vz5, vc3, vc0);
vz10 = vec_madd(vz6, vc3, vc0);
vz11 = vec_madd(vz7, vc3, vc0);
vz8 = vec_madd(vz5, vc4, vz8);
vz9 = vec_madd(vz4, vc5, vz9);
vz10 = vec_madd(vz7, vc5, vz10);
vz11 = vec_madd(vz6, vc4, vz11);
// Combine the multiplied terms with each other, then with the lower half.
vz12 = vec_sub(vz10, vz8);
vz10 = vec_add(vz10, vz8);
vz13 = vec_sub(vz9, vz11);
vz11 = vec_add(vz9, vz11);
vz4 = vec_sub(vz0, vz10);
vz0 = vec_add(vz0, vz10);
vz7= vec_sub(vz3, vz12);
vz3= vec_add(vz3, vz12);
vz5 = vec_sub(vz1, vz11);
vz1 = vec_add(vz1, vz11);
vz6 = vec_sub(vz2, vz13);
vz2 = vec_add(vz2, vz13);
// Store all eight result vectors back over the input.
vec_st(vz0, 0, &(out[0]));
vec_st(vz1, byte_2complex, &(out[0]));
vec_st(vz2, byte_4complex, &(out[0]));
vec_st(vz3, byte_6complex, &(out[0]));
vec_st(vz4, byte_8complex, &(out[0]));
vec_st(vz5, byte_10complex, &(out[0]));
vec_st(vz6, byte_12complex, &(out[0]));
vec_st(vz7, byte_14complex, &(out[0]));
return;
}
/*
 * One combining pass over z for the non-interleaved kernel family.
 *
 * z is accessed as a flat FFTSample array through `out`. Four regions at
 * element offsets 0, 2n, 4n and 6n of z (byte offsets 0, i1, i2, i3) are
 * processed in lock-step. Per the inline comments, each region holds a
 * vector of four real parts followed, 16 bytes later, by a vector of four
 * imaginary parts. Four wre values are loaded as one vector; the matching
 * wim values are assembled in reverse order from two loads around wim,
 * which starts at wre + 2n and walks backwards.
 *
 * The first step is written out before the loop with the same statements as
 * the loop body. n must be even and greater than 2: the counter is unsigned,
 * so with n == 2 the `n-=2` in the do/while condition would wrap instead of
 * reaching zero.
 *
 * NOTE(review): vec_ld is used on wre/wim, so the twiddle tables presumably
 * need 16-byte alignment -- confirm against their definition.
 */
inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
{
int o1 = n<<1;
int o2 = n<<2;
int o3 = o1+o2;
int i1, i2, i3;
FFTSample* out = (FFTSample*)z;
const FFTSample *wim = wre+o1;
vec_f v0, v1, v2, v3;
vec_f v4, v5, v6, v7;
vec_f v8, v9, v10, v11;
vec_f v12, v13;
// The first step below accounts for two of the n counts.
n = n-2;
i1 = o1*sizeof(FFTComplex);
i2 = o2*sizeof(FFTComplex);
i3 = o3*sizeof(FFTComplex);
// ---- first step ----
v8 = vec_ld(0, &(wre[0]));
v10 = vec_ld(0, &(wim[0]));
v9 = vec_ld(0, &(wim[-4]));
// Build {wim[0], wim[-1], wim[-2], wim[-3]} from the two loads.
v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
v4 = vec_ld(i2, &(out[0]));
v5 = vec_ld(i2+16, &(out[0]));
v6 = vec_ld(i3, &(out[0]));
v7 = vec_ld(i3+16, &(out[0]));
v10 = vec_mul(v4, v8); // r2*wre
v11 = vec_mul(v5, v8); // i2*wre
v12 = vec_mul(v6, v8); // r3*wre
v13 = vec_mul(v7, v8); // i3*wre
v0 = vec_ld(0, &(out[0])); // r0
v3 = vec_ld(i1+16, &(out[0])); // i1
v10 = vec_madd(v5, v9, v10); // r2*wim
v11 = vec_nmsub(v4, v9, v11); // i2*wim
v12 = vec_nmsub(v7, v9, v12); // r3*wim
v13 = vec_madd(v6, v9, v13); // i3*wim
v1 = vec_ld(16, &(out[0])); // i0
v2 = vec_ld(i1, &(out[0])); // r1
// Sum and difference of the two twiddled regions.
v8 = vec_sub(v12, v10);
v12 = vec_add(v12, v10);
v9 = vec_sub(v11, v13);
v13 = vec_add(v11, v13);
// Combine with regions 0 and o1 and store each result once it is ready.
v4 = vec_sub(v0, v12);
v0 = vec_add(v0, v12);
v7 = vec_sub(v3, v8);
v3 = vec_add(v3, v8);
vec_st(v0, 0, &(out[0])); // r0
vec_st(v3, i1+16, &(out[0])); // i1
vec_st(v4, i2, &(out[0])); // r2
vec_st(v7, i3+16, &(out[0]));// i3
v5 = vec_sub(v1, v13);
v1 = vec_add(v1, v13);
v6 = vec_sub(v2, v9);
v2 = vec_add(v2, v9);
vec_st(v1, 16, &(out[0])); // i0
vec_st(v2, i1, &(out[0])); // r1
vec_st(v5, i2+16, &(out[0])); // i2
vec_st(v6, i3, &(out[0])); // r3
// ---- remaining steps: identical computation on the next 8 samples ----
do {
out += 8;
wre += 4;
wim -= 4;
v8 = vec_ld(0, &(wre[0]));
v10 = vec_ld(0, &(wim[0]));
v9 = vec_ld(0, &(wim[-4]));
v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
v4 = vec_ld(i2, &(out[0])); // r2
v5 = vec_ld(i2+16, &(out[0])); // i2
v6 = vec_ld(i3, &(out[0])); // r3
v7 = vec_ld(i3+16, &(out[0]));// i3
v10 = vec_mul(v4, v8); // r2*wre
v11 = vec_mul(v5, v8); // i2*wre
v12 = vec_mul(v6, v8); // r3*wre
v13 = vec_mul(v7, v8); // i3*wre
v0 = vec_ld(0, &(out[0])); // r0
v3 = vec_ld(i1+16, &(out[0])); // i1
v10 = vec_madd(v5, v9, v10); // r2*wim
v11 = vec_nmsub(v4, v9, v11); // i2*wim
v12 = vec_nmsub(v7, v9, v12); // r3*wim
v13 = vec_madd(v6, v9, v13); // i3*wim
v1 = vec_ld(16, &(out[0])); // i0
v2 = vec_ld(i1, &(out[0])); // r1
v8 = vec_sub(v12, v10);
v12 = vec_add(v12, v10);
v9 = vec_sub(v11, v13);
v13 = vec_add(v11, v13);
v4 = vec_sub(v0, v12);
v0 = vec_add(v0, v12);
v7 = vec_sub(v3, v8);
v3 = vec_add(v3, v8);
vec_st(v0, 0, &(out[0])); // r0
vec_st(v3, i1+16, &(out[0])); // i1
vec_st(v4, i2, &(out[0])); // r2
vec_st(v7, i3+16, &(out[0])); // i3
v5 = vec_sub(v1, v13);
v1 = vec_add(v1, v13);
v6 = vec_sub(v2, v9);
v2 = vec_add(v2, v9);
vec_st(v1, 16, &(out[0])); // i0
vec_st(v2, i1, &(out[0])); // r1
vec_st(v5, i2+16, &(out[0])); // i2
vec_st(v6, i3, &(out[0])); // r3
// Each step consumes two counts; the loop ends when the counter hits zero.
} while (n-=2);
}
#endif
#endif /* AVCODEC_PPC_FFT_VSX_H */

View File

@ -1,120 +0,0 @@
/*
* (I)RDFT transforms
* Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdlib.h>
#include <math.h>
#include "libavutil/error.h"
#include "libavutil/mathematics.h"
#include "rdft.h"
/**
* @file
* (Inverse) Real Discrete Fourier Transforms.
*/
/** Map one real FFT into two parallel real even and odd FFTs. Then interleave
 * the two real FFTs into one complex FFT. Unmangle the results.
 * ref: http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
 *
 * The transform is done in place on data, which holds n = 1 << s->nbits
 * samples and is reinterpreted as FFTComplex for the half-size complex FFT
 * in s->fft.
 *
 * Forward (s->inverse == 0): permute + complex FFT first, then unmangle.
 * Inverse (s->inverse != 0): unmangle first, scale data[0] and data[1] by
 * 0.5, then permute + complex FFT.
 *
 * @param s    context set up by ff_rdft_init()
 * @param data n samples, transformed in place
 */
static void rdft_calc_c(RDFTContext *s, FFTSample *data)
{
int i, i1, i2;
FFTComplex ev, od, odsum;
const int n = 1 << s->nbits;
const float k1 = 0.5;
/* ff_rdft_init() stores a 0/1 comparison result in inverse, so k2 is +0.5
 * for the forward transform and -0.5 for the inverse one. */
const float k2 = 0.5 - s->inverse;
const FFTSample *tcos = s->tcos;
const FFTSample *tsin = s->tsin;
if (!s->inverse) {
s->fft.fft_permute(&s->fft, (FFTComplex*)data);
s->fft.fft_calc(&s->fft, (FFTComplex*)data);
}
/* i=0 is a special case because of packing, the DC term is real, so we
are going to throw the N/2 term (also real) in with it. */
ev.re = data[0];
data[0] = ev.re+data[1];
data[1] = ev.re-data[1];
/* RDFT_UNMANGLE processes the bin pairs (i, n/2 - i) for i in [1, n/4).
 * sign0/sign1 select how the sine term enters the twiddle product. The loop
 * index i is deliberately left at n/4 on exit; it is used right after. */
#define RDFT_UNMANGLE(sign0, sign1) \
for (i = 1; i < (n>>2); i++) { \
i1 = 2*i; \
i2 = n-i1; \
/* Separate even and odd FFTs */ \
ev.re = k1*(data[i1 ]+data[i2 ]); \
od.im = k2*(data[i2 ]-data[i1 ]); \
ev.im = k1*(data[i1+1]-data[i2+1]); \
od.re = k2*(data[i1+1]+data[i2+1]); \
/* Apply twiddle factors to the odd FFT and add to the even FFT */ \
odsum.re = od.re*tcos[i] sign0 od.im*tsin[i]; \
odsum.im = od.im*tcos[i] sign1 od.re*tsin[i]; \
data[i1 ] = ev.re + odsum.re; \
data[i1+1] = ev.im + odsum.im; \
data[i2 ] = ev.re - odsum.re; \
data[i2+1] = odsum.im - ev.im; \
}
if (s->negative_sin) {
RDFT_UNMANGLE(+,-)
} else {
RDFT_UNMANGLE(-,+)
}
/* Here i == n/4, so this touches data[n/2 + 1], the imaginary part of the
 * middle bin, which the loop above does not visit. */
data[2*i+1]=s->sign_convention*data[2*i+1];
if (s->inverse) {
data[0] *= k1;
data[1] *= k1;
s->fft.fft_permute(&s->fft, (FFTComplex*)data);
s->fft.fft_calc(&s->fft, (FFTComplex*)data);
}
}
/**
 * Set up a real FFT context.
 *
 * @param s     context to initialise
 * @param nbits log2 of the transform length; must be in [4, 16]
 * @param trans transform type; selects the direction, the sign convention
 *              and whether the sine term is negated
 * @return 0 on success, AVERROR(EINVAL) if nbits is out of range, or the
 *         negative error code returned by ff_fft_init()
 */
av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
{
    int n;
    int ret;

    s->nbits           = nbits;
    s->inverse         = trans == IDFT_C2R || trans == DFT_C2R;
    s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 1 : -1;
    s->negative_sin    = trans == DFT_C2R || trans == DFT_R2C;

    if (nbits < 4 || nbits > 16)
        return AVERROR(EINVAL);

    /* Only shift once nbits is known to be in range: 1 << nbits is undefined
     * for a negative or oversized shift count. */
    n = 1 << nbits;

    /* The real transform is built on a complex FFT of half the length. */
    if ((ret = ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C)) < 0)
        return ret;

    ff_init_ff_cos_tabs(nbits);
    s->tcos = ff_cos_tabs[nbits];
    /* The sine table is the same table read from a quarter length onwards. */
    s->tsin = ff_cos_tabs[nbits] + (n >> 2);
    s->rdft_calc   = rdft_calc_c;

#if ARCH_ARM
    ff_rdft_init_arm(s);
#endif

    return 0;
}
/* Tear down the complex FFT embedded in the RDFT context. */
av_cold void ff_rdft_end(RDFTContext *s)
{
    FFTContext *const inner = &s->fft;

    ff_fft_end(inner);
}

View File

@ -1,52 +0,0 @@
/*
* (I)RDFT transforms
* Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#if !defined(AVCODEC_RDFT_H) && (!defined(FFT_FLOAT) || FFT_FLOAT)
#define AVCODEC_RDFT_H
#include "config.h"
#include "fft.h"
/* State of one real (I)DFT instance; filled in by ff_rdft_init(). */
struct RDFTContext {
/* log2 of the transform length, as passed to ff_rdft_init() */
int nbits;
/* direction flag derived from the transform type -- TODO confirm the exact
 * mapping against ff_rdft_init() */
int inverse;
/* sign factor applied by the transform; derived from the transform type */
int sign_convention;
/* pre/post rotation tables */
const FFTSample *tcos;
const FFTSample *tsin;
/* selects the sign of the sine term; derived from the transform type */
int negative_sin;
/* underlying complex FFT context */
FFTContext fft;
/* transform entry point; operates on z in place (presumably replaceable by
 * an architecture-specific version, see ff_rdft_init_arm()) */
void (*rdft_calc)(struct RDFTContext *s, FFTSample *z);
};
/**
 * Set up a real FFT.
 * @param s     the context to initialise
 * @param nbits log2 of the length of the input array
 * @param trans the type of transform
 * @return an int status code (presumably 0 on success and a negative
 *         AVERROR value on failure -- confirm against the implementation)
 */
int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans);
/** Release what ff_rdft_init() set up in s. */
void ff_rdft_end(RDFTContext *s);
/** ARM-specific initialisation hook (presumably installs an optimised
 *  rdft_calc -- the definition is not in this header). */
void ff_rdft_init_arm(RDFTContext *s);
#endif /* AVCODEC_RDFT_H */

View File

@ -1,6 +1,5 @@
/av1_levels
/avcodec
/avfft
/avpacket
/bitstream_be
/bitstream_le
@ -8,8 +7,6 @@
/celp_math
/codec_desc
/dct
/fft
/fft-fixed32
/golomb
/h264_levels
/h265_levels

View File

@ -1,25 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* This test is similar to fft-fixed.c or fft-fixed32.c
*/
#define AVFFT 1
#define FFT_FLOAT 1
#include "fft.c"

View File

@ -1,21 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#define FFT_FLOAT 0
#define AVFFT 0
#include "fft.c"

View File

@ -1,683 +0,0 @@
/*
* (c) 2002 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/internal.h"
FF_DISABLE_DEPRECATION_WARNINGS
/**
* @file
* FFT and MDCT tests.
*/
#include "config.h"
#ifndef AVFFT
#define AVFFT 0
#endif
#include <math.h>
#if HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libavutil/cpu.h"
#include "libavutil/error.h"
#include "libavutil/lfg.h"
#include "libavutil/log.h"
#include "libavutil/mathematics.h"
#include "libavutil/time.h"
#if AVFFT
#include "libavcodec/avfft.h"
#else
#include "libavcodec/fft.h"
#endif
#if FFT_FLOAT
#include "libavcodec/dct.h"
#include "libavcodec/rdft.h"
#endif
/* reference fft */
/* Plain product; despite the name no 16-bit narrowing is applied. */
#define MUL16(a, b) ((a) * (b))
/* Complex multiply-accumulate: (pre + i*pim) += (are + i*aim) * (bre + i*bim).
 * pre and pim must be lvalues. */
#define CMAC(pre, pim, are, aim, bre, bim) \
{ \
pre += (MUL16(are, bre) - MUL16(aim, bim)); \
pim += (MUL16(are, bim) + MUL16(bre, aim)); \
}
#if FFT_FLOAT || AVFFT
#define RANGE 1.0
#define REF_SCALE(x, bits) (x)
#define FMT "%10.6f"
#else
#define RANGE 8388608
#define REF_SCALE(x, bits) (x)
#define FMT "%6d"
#endif
/* Twiddle table for the reference FFT: n/2 complex exponentials, allocated
 * and filled by fft_ref_init() and read by fft_ref(). */
static struct {
float re, im;
} *exptab;
/*
 * Allocate exptab and fill it with exp(-/+ 2*pi*i*k/n) for k in [0, n/2),
 * where n = 1 << nbits. The sine term is negated for the forward transform
 * (inverse == 0) and kept as is for the inverse one.
 *
 * Returns 0 on success or AVERROR(ENOMEM) if the table cannot be allocated.
 */
static int fft_ref_init(int nbits, int inverse)
{
    const int n    = 1 << nbits;
    const int half = n / 2;
    int idx;

    exptab = av_malloc_array(half, sizeof(*exptab));
    if (!exptab)
        return AVERROR(ENOMEM);

    for (idx = 0; idx < half; idx++) {
        double alpha  = 2 * M_PI * (float) idx / (float) n;
        double cosine = cos(alpha);
        double sine   = sin(alpha);

        exptab[idx].re = cosine;
        exptab[idx].im = inverse ? sine : -sine;
    }

    return 0;
}
/*
 * Reference O(n^2) DFT used to validate the optimised FFT.
 *
 * For every output bin i, accumulates tab[j] * w^(i*j) over all j in double
 * precision, where w^k is read from exptab (set up by fft_ref_init()).
 * exptab only holds the first half of the unit circle; for k >= n/2 the
 * negated entry k - n/2 is used.
 *
 * @param tabr  output, n = 1 << nbits complex values
 * @param tab   input, n complex values (not modified)
 * @param nbits log2 of the transform length
 */
static void fft_ref(FFTComplex *tabr, FFTComplex *tab, int nbits)
{
    int i, j;
    int n  = 1 << nbits;
    int n2 = n >> 1;

    for (i = 0; i < n; i++) {
        double tmp_re = 0, tmp_im = 0;
        FFTComplex *q = tab;

        for (j = 0; j < n; j++) {
            double s, c;
            /* Multiply in unsigned arithmetic: i * j exceeds INT_MAX once
             * nbits reaches 16, and signed overflow is undefined. Unsigned
             * wrap-around is harmless because only the low nbits bits are
             * kept by the mask. */
            int k = (int) (((unsigned) i * (unsigned) j) & (unsigned) (n - 1));

            if (k >= n2) {
                c = -exptab[k - n2].re;
                s = -exptab[k - n2].im;
            } else {
                c = exptab[k].re;
                s = exptab[k].im;
            }
            CMAC(tmp_re, tmp_im, c, s, q->re, q->im);
            q++;
        }
        tabr[i].re = REF_SCALE(tmp_re, nbits);
        tabr[i].im = REF_SCALE(tmp_im, nbits);
    }
}
#if CONFIG_MDCT
/*
 * Reference inverse MDCT, computed directly from the defining sum.
 *
 * @param out   n = 1 << nbits output samples
 * @param in    n / 2 input coefficients
 * @param nbits log2 of the output length
 */
static void imdct_ref(FFTSample *out, FFTSample *in, int nbits)
{
    int i, k, n = 1 << nbits;

    for (i = 0; i < n; i++) {
        double sum = 0;

        for (k = 0; k < n / 2; k++) {
            /* Form the product in double: as an int it reaches roughly
             * 2.5 * n^2 and overflows for nbits >= 15. Both factors are
             * exactly representable, so results for smaller sizes are
             * unchanged. */
            double a = (double) (2 * i + 1 + (n / 2)) * (2 * k + 1);
            double f = cos(M_PI * a / (double) (2 * n));
            sum += f * in[k];
        }
        out[i] = REF_SCALE(-sum, nbits - 2);
    }
}
/*
 * Reference forward MDCT, computed directly from the defining sum.
 * NOTE: no normalisation by 1 / N is done
 *
 * input holds n = 1 << nbits samples; output receives n / 2 coefficients.
 */
static void mdct_ref(FFTSample *output, FFTSample *input, int nbits)
{
    const int n = 1 << nbits;
    int coef, smp;

    for (coef = 0; coef < n / 2; coef++) {
        double acc = 0;

        for (smp = 0; smp < n; smp++) {
            double phase = (2 * M_PI * (2 * smp + 1 + n / 2) * (2 * coef + 1) / (4 * n));
            acc += input[smp] * cos(phase);
        }
        output[coef] = REF_SCALE(acc, nbits - 1);
    }
}
#endif /* CONFIG_MDCT */
#if FFT_FLOAT
#if CONFIG_DCT
/*
 * Reference inverse DCT over n = 1 << nbits points, evaluated term by term:
 * output[i] = (2 / n) * (input[0] / 2 + sum_{k>=1} input[k] * cos(pi*k*(i+0.5)/n)).
 */
static void idct_ref(FFTSample *output, FFTSample *input, int nbits)
{
    const int n = 1 << nbits;
    int pos, freq;

    for (pos = 0; pos < n; pos++) {
        /* The k == 0 term enters with weight one half. */
        double acc = 0.5 * input[0];

        for (freq = 1; freq < n; freq++) {
            double phase = M_PI * freq * (pos + 0.5) / n;
            acc += input[freq] * cos(phase);
        }
        output[pos] = 2 * acc / n;
    }
}
/*
 * Reference forward DCT over n = 1 << nbits points, evaluated term by term:
 * output[k] = sum_i input[i] * cos(pi * k * (i + 0.5) / n), unnormalised.
 */
static void dct_ref(FFTSample *output, FFTSample *input, int nbits)
{
    const int n = 1 << nbits;
    int freq, pos;

    for (freq = 0; freq < n; freq++) {
        double acc = 0;

        for (pos = 0; pos < n; pos++) {
            double phase = M_PI * freq * (pos + 0.5) / n;
            acc += input[pos] * cos(phase);
        }
        output[freq] = acc;
    }
}
#endif /* CONFIG_DCT */
#endif /* FFT_FLOAT */
/* Draw one pseudo-random sample: the generator output is truncated to a
 * signed 16-bit value, scaled by 1/32768 and then by RANGE. */
static FFTSample frandom(AVLFG *prng)
{
    const int16_t raw = (int16_t) av_lfg_get(prng);

    return raw / 32768.0 * RANGE;
}
/**
 * Compare a reference result against a computed one.
 *
 * Each computed value is divided by scale before comparison and the
 * deviation is expressed relative to RANGE. Every element whose deviation
 * is not below 1e-3 is logged; the maximum and RMS deviation are logged
 * once at the end.
 *
 * @param tab1  reference values
 * @param tab2  computed values
 * @param n     number of values to compare
 * @param scale factor the computed values were scaled by
 * @return 1 if at least one element is out of tolerance, 0 otherwise
 */
static int check_diff(FFTSample *tab1, FFTSample *tab2, int n, double scale)
{
    int i, err = 0;
    double error = 0, max = 0;

    for (i = 0; i < n; i++) {
        double e = fabs(tab1[i] - (tab2[i] / scale)) / RANGE;
        /* Negated comparison on purpose: a NaN deviation compares false
         * against everything and would otherwise be accepted silently. */
        if (!(e < 1e-3)) {
            av_log(NULL, AV_LOG_ERROR, "ERROR %5d: "FMT" "FMT"\n",
                   i, tab1[i], tab2[i]);
            err = 1;
        }
        error += e * e;
        if (e > max)
            max = e;
    }
    av_log(NULL, AV_LOG_INFO, "max:%f e:%g\n", max, sqrt(error / n));
    return err;
}
/**
 * Prepare an FFT context for the implementation under test.
 *
 * With AVFFT the context is obtained from av_fft_init() and stored through
 * s; otherwise the context *s supplied by the caller is initialised with
 * ff_fft_init(). No status is reported back to the caller.
 */
static inline void fft_init(FFTContext **s, int nbits, int inverse)
{
#if !AVFFT
    ff_fft_init(*s, nbits, inverse);
#else
    *s = av_fft_init(nbits, inverse);
#endif
}
#if CONFIG_MDCT
/**
 * Prepare an MDCT context for the implementation under test.
 *
 * With AVFFT the context is obtained from av_mdct_init() and stored through
 * s; otherwise the context *s supplied by the caller is initialised with
 * ff_mdct_init(). No status is reported back to the caller.
 */
static inline void mdct_init(FFTContext **s, int nbits, int inverse, double scale)
{
#if !AVFFT
    ff_mdct_init(*s, nbits, inverse, scale);
#else
    *s = av_mdct_init(nbits, inverse, scale);
#endif
}
/**
 * Run the forward MDCT: through av_mdct_calc() with AVFFT, otherwise
 * through the mdct_calc function pointer stored in the context.
 */
static inline void mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
{
#if !AVFFT
    s->mdct_calc(s, output, input);
#else
    av_mdct_calc(s, output, input);
#endif
}
/**
 * Run the inverse MDCT: through av_imdct_calc() with AVFFT, otherwise
 * through the imdct_calc function pointer stored in the context.
 */
static inline void imdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input)
{
#if !AVFFT
    s->imdct_calc(s, output, input);
#else
    av_imdct_calc(s, output, input);
#endif
}
#endif
/**
 * Apply the input permutation the FFT implementation expects, in place on
 * z: through av_fft_permute() with AVFFT, otherwise through the context's
 * fft_permute function pointer.
 */
static inline void fft_permute(FFTContext *s, FFTComplex *z)
{
#if !AVFFT
    s->fft_permute(s, z);
#else
    av_fft_permute(s, z);
#endif
}
/**
 * Run the FFT in place on z: through av_fft_calc() with AVFFT, otherwise
 * through the context's fft_calc function pointer.
 */
static inline void fft_calc(FFTContext *s, FFTComplex *z)
{
#if !AVFFT
    s->fft_calc(s, z);
#else
    av_fft_calc(s, z);
#endif
}
/**
 * Tear down an MDCT context with the function matching how it was set up:
 * av_mdct_end() with AVFFT, ff_mdct_end() otherwise.
 */
static inline void mdct_end(FFTContext *s)
{
#if !AVFFT
    ff_mdct_end(s);
#else
    av_mdct_end(s);
#endif
}
/**
 * Tear down an FFT context with the function matching how it was set up:
 * av_fft_end() with AVFFT, ff_fft_end() otherwise.
 */
static inline void fft_end(FFTContext *s)
{
#if !AVFFT
    ff_fft_end(s);
#else
    av_fft_end(s);
#endif
}
#if FFT_FLOAT
/**
 * Prepare an RDFT context for the implementation under test.
 *
 * With AVFFT the context is obtained from av_rdft_init() and stored through
 * r; otherwise the context *r supplied by the caller is initialised with
 * ff_rdft_init(). No status is reported back to the caller.
 */
static inline void rdft_init(RDFTContext **r, int nbits, enum RDFTransformType trans)
{
#if !AVFFT
    ff_rdft_init(*r, nbits, trans);
#else
    *r = av_rdft_init(nbits, trans);
#endif
}
/**
 * Prepare a DCT context for the implementation under test.
 *
 * With AVFFT the context is obtained from av_dct_init() and stored through
 * d; otherwise the context *d supplied by the caller is initialised with
 * ff_dct_init(). No status is reported back to the caller.
 */
static inline void dct_init(DCTContext **d, int nbits, enum DCTTransformType trans)
{
#if !AVFFT
    ff_dct_init(*d, nbits, trans);
#else
    *d = av_dct_init(nbits, trans);
#endif
}
/**
 * Run the RDFT in place on tab: through av_rdft_calc() with AVFFT,
 * otherwise through the context's rdft_calc function pointer.
 */
static inline void rdft_calc(RDFTContext *r, FFTSample *tab)
{
#if !AVFFT
    r->rdft_calc(r, tab);
#else
    av_rdft_calc(r, tab);
#endif
}
/**
 * Run the DCT in place on data: through av_dct_calc() with AVFFT,
 * otherwise through the context's dct_calc function pointer.
 */
static inline void dct_calc(DCTContext *d, FFTSample *data)
{
#if !AVFFT
    d->dct_calc(d, data);
#else
    av_dct_calc(d, data);
#endif
}
/**
 * Tear down an RDFT context with the function matching how it was set up:
 * av_rdft_end() with AVFFT, ff_rdft_end() otherwise.
 */
static inline void rdft_end(RDFTContext *r)
{
#if !AVFFT
    ff_rdft_end(r);
#else
    av_rdft_end(r);
#endif
}
/**
 * Tear down a DCT context with the function matching how it was set up:
 * av_dct_end() with AVFFT, ff_dct_end() otherwise.
 */
static inline void dct_end(DCTContext *d)
{
#if !AVFFT
    ff_dct_end(d);
#else
    av_dct_end(d);
#endif
}
#endif /* FFT_FLOAT */
/**
 * Print the command line synopsis and the meaning of every option that
 * main() passes to getopt().
 */
static void help(void)
{
    av_log(NULL, AV_LOG_INFO,
           "usage: fft-test [-h] [-s] [-m] [-d] [-r] [-i] [-n b] [-f x] [-c flags]\n"
           "-h print this help\n"
           "-s speed test\n"
           "-m (I)MDCT test\n"
           "-d (I)DCT test\n"
           "-r (I)RDFT test\n"
           "-i inverse transform test\n"
           "-n b set the transform size to 2^b\n"
           "-f x set scale factor for output data of (I)MDCT to x\n"
           "-c flags force the CPU flags to flags\n");
}
/* Transform family exercised by one run of the test program. */
enum tf_transform {
TRANSFORM_FFT, /* complex FFT; the default when no selector option is given */
TRANSFORM_MDCT, /* (I)MDCT, selected with -m */
TRANSFORM_RDFT, /* (I)RDFT, selected with -r */
TRANSFORM_DCT, /* (I)DCT, selected with -d */
};
#if !HAVE_GETOPT
#include "compat/getopt.c"
#endif
/**
 * Test driver: parses the options, runs the selected transform once on
 * pseudo-random data and compares it against the matching reference
 * implementation, optionally followed by a timing loop (-s).
 *
 * @return 0 if the transform matched the reference, 1 on a mismatch, on
 *         bad usage or on any set-up failure
 */
int main(int argc, char **argv)
{
    /* Everything released at the cleanup label starts out as NULL so that
     * the label can be reached from any point below. */
    FFTComplex *tab = NULL, *tab1 = NULL, *tab_ref = NULL;
    FFTSample *tab2 = NULL;
    enum tf_transform transform = TRANSFORM_FFT;
    FFTContext *m = NULL, *s = NULL;
#if FFT_FLOAT
    RDFTContext *r = NULL;
    DCTContext *d = NULL;
#endif /* FFT_FLOAT */
    int it, i, err = 1;
    int do_speed = 0, do_inverse = 0;
    int fft_nbits = 9, fft_size;
    double scale = 1.0;
    AVLFG prng;

    /* Without AVFFT the contexts are owned by this function and only
     * initialised in place by the *_init() wrappers. */
#if !AVFFT
    s = av_mallocz(sizeof(*s));
    m = av_mallocz(sizeof(*m));
    if (!s || !m)
        goto cleanup;
#endif

#if !AVFFT && FFT_FLOAT
    r = av_mallocz(sizeof(*r));
    d = av_mallocz(sizeof(*d));
    if (!r || !d)
        goto cleanup;
#endif

    av_lfg_init(&prng, 1);

    for (;;) {
        int c = getopt(argc, argv, "hsimrdn:f:c:");
        if (c == -1)
            break;
        switch (c) {
        case 'h':
            help();
            return 1;
        case 's':
            do_speed = 1;
            break;
        case 'i':
            do_inverse = 1;
            break;
        case 'm':
            transform = TRANSFORM_MDCT;
            break;
        case 'r':
            transform = TRANSFORM_RDFT;
            break;
        case 'd':
            transform = TRANSFORM_DCT;
            break;
        case 'n':
            fft_nbits = atoi(optarg);
            break;
        case 'f':
            scale = atof(optarg);
            break;
        case 'c':
        {
            unsigned cpuflags = av_get_cpu_flags();

            if (av_parse_cpu_caps(&cpuflags, optarg) < 0)
                return 1;

            av_force_cpu_flags(cpuflags);
            break;
        }
        }
    }

    /* Shifting by a negative count or past the sign bit is undefined, so
     * reject such exponents before computing the transform size. */
    if (fft_nbits < 0 || fft_nbits > 30) {
        av_log(NULL, AV_LOG_ERROR, "Invalid transform size exponent %d\n", fft_nbits);
        goto cleanup;
    }

    fft_size = 1 << fft_nbits;
    tab      = av_malloc_array(fft_size, sizeof(FFTComplex));
    tab1     = av_malloc_array(fft_size, sizeof(FFTComplex));
    tab_ref  = av_malloc_array(fft_size, sizeof(FFTComplex));
    tab2     = av_malloc_array(fft_size, sizeof(FFTSample));

    if (!(tab && tab1 && tab_ref && tab2))
        goto cleanup;

    /* set up the transform under test (and the twiddle table of the
     * reference FFT where the check below needs it) */
    switch (transform) {
#if CONFIG_MDCT
    case TRANSFORM_MDCT:
        av_log(NULL, AV_LOG_INFO, "Scale factor is set to %f\n", scale);
        if (do_inverse)
            av_log(NULL, AV_LOG_INFO, "IMDCT");
        else
            av_log(NULL, AV_LOG_INFO, "MDCT");
        mdct_init(&m, fft_nbits, do_inverse, scale);
        break;
#endif /* CONFIG_MDCT */
    case TRANSFORM_FFT:
        if (do_inverse)
            av_log(NULL, AV_LOG_INFO, "IFFT");
        else
            av_log(NULL, AV_LOG_INFO, "FFT");
        fft_init(&s, fft_nbits, do_inverse);
        if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0)
            goto cleanup;
        break;
#if FFT_FLOAT
#    if CONFIG_RDFT
    case TRANSFORM_RDFT:
        if (do_inverse)
            av_log(NULL, AV_LOG_INFO, "IDFT_C2R");
        else
            av_log(NULL, AV_LOG_INFO, "DFT_R2C");
        rdft_init(&r, fft_nbits, do_inverse ? IDFT_C2R : DFT_R2C);
        if ((err = fft_ref_init(fft_nbits, do_inverse)) < 0)
            goto cleanup;
        break;
#    endif /* CONFIG_RDFT */
#    if CONFIG_DCT
    case TRANSFORM_DCT:
        if (do_inverse)
            av_log(NULL, AV_LOG_INFO, "DCT_III");
        else
            av_log(NULL, AV_LOG_INFO, "DCT_II");
        dct_init(&d, fft_nbits, do_inverse ? DCT_III : DCT_II);
        break;
#    endif /* CONFIG_DCT */
#endif /* FFT_FLOAT */
    default:
        av_log(NULL, AV_LOG_ERROR, "Requested transform not supported\n");
        goto cleanup;
    }
    av_log(NULL, AV_LOG_INFO, " %d test\n", fft_size);

    /* generate random data */
    for (i = 0; i < fft_size; i++) {
        tab1[i].re = frandom(&prng);
        tab1[i].im = frandom(&prng);
    }

    /* checking result */
    av_log(NULL, AV_LOG_INFO, "Checking...\n");

    switch (transform) {
#if CONFIG_MDCT
    case TRANSFORM_MDCT:
        if (do_inverse) {
            imdct_ref(&tab_ref->re, &tab1->re, fft_nbits);
            imdct_calc(m, tab2, &tab1->re);
            err = check_diff(&tab_ref->re, tab2, fft_size, scale);
        } else {
            mdct_ref(&tab_ref->re, &tab1->re, fft_nbits);
            mdct_calc(m, tab2, &tab1->re);
            err = check_diff(&tab_ref->re, tab2, fft_size / 2, scale);
        }
        break;
#endif /* CONFIG_MDCT */
    case TRANSFORM_FFT:
        memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
        fft_permute(s, tab);
        fft_calc(s, tab);

        fft_ref(tab_ref, tab1, fft_nbits);
        err = check_diff(&tab_ref->re, &tab->re, fft_size * 2, 1.0);
        break;
#if FFT_FLOAT
#if CONFIG_RDFT
    case TRANSFORM_RDFT:
    {
        int fft_size_2 = fft_size >> 1;
        if (do_inverse) {
            /* make the spectrum conjugate-symmetric so that the reference
             * complex FFT yields a real signal */
            tab1[0].im          = 0;
            tab1[fft_size_2].im = 0;
            for (i = 1; i < fft_size_2; i++) {
                tab1[fft_size_2 + i].re =  tab1[fft_size_2 - i].re;
                tab1[fft_size_2 + i].im = -tab1[fft_size_2 - i].im;
            }

            memcpy(tab2, tab1, fft_size * sizeof(FFTSample));
            /* the real part of bin n/2 is passed in slot 1 */
            tab2[1] = tab1[fft_size_2].re;

            rdft_calc(r, tab2);
            fft_ref(tab_ref, tab1, fft_nbits);
            for (i = 0; i < fft_size; i++) {
                tab[i].re = tab2[i];
                tab[i].im = 0;
            }
            err = check_diff(&tab_ref->re, &tab->re, fft_size * 2, 0.5);
        } else {
            for (i = 0; i < fft_size; i++) {
                tab2[i]    = tab1[i].re;
                tab1[i].im = 0;
            }
            rdft_calc(r, tab2);
            fft_ref(tab_ref, tab1, fft_nbits);
            /* mirror the packing of the output: bin n/2 real part in slot 1 */
            tab_ref[0].im = tab_ref[fft_size_2].re;
            err = check_diff(&tab_ref->re, tab2, fft_size, 1.0);
        }
        break;
    }
#endif /* CONFIG_RDFT */
#if CONFIG_DCT
    case TRANSFORM_DCT:
        memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
        dct_calc(d, &tab->re);
        if (do_inverse)
            idct_ref(&tab_ref->re, &tab1->re, fft_nbits);
        else
            dct_ref(&tab_ref->re, &tab1->re, fft_nbits);
        err = check_diff(&tab_ref->re, &tab->re, fft_size, 1.0);
        break;
#endif /* CONFIG_DCT */
#endif /* FFT_FLOAT */
    }

    /* do a speed test */

    if (do_speed) {
        int64_t time_start, duration;
        int nb_its;

        av_log(NULL, AV_LOG_INFO, "Speed test...\n");
        /* measure for about one second: the iteration count is doubled
         * until a single run takes at least that long */
        nb_its = 1;
        for (;;) {
            time_start = av_gettime_relative();
            for (it = 0; it < nb_its; it++) {
                switch (transform) {
#if CONFIG_MDCT
                case TRANSFORM_MDCT:
                    if (do_inverse)
                        imdct_calc(m, &tab->re, &tab1->re);
                    else
                        mdct_calc(m, &tab->re, &tab1->re);
                    break;
#endif
                case TRANSFORM_FFT:
                    memcpy(tab, tab1, fft_size * sizeof(FFTComplex));
                    fft_calc(s, tab);
                    break;
#if FFT_FLOAT
#    if CONFIG_RDFT
                case TRANSFORM_RDFT:
                    memcpy(tab2, tab1, fft_size * sizeof(FFTSample));
                    rdft_calc(r, tab2);
                    break;
#    endif /* CONFIG_RDFT */
#    if CONFIG_DCT
                case TRANSFORM_DCT:
                    memcpy(tab2, tab1, fft_size * sizeof(FFTSample));
                    dct_calc(d, tab2);
                    break;
#    endif /* CONFIG_DCT */
#endif /* FFT_FLOAT */
                }
            }
            duration = av_gettime_relative() - time_start;
            if (duration >= 1000000)
                break;
            nb_its *= 2;
        }
        av_log(NULL, AV_LOG_INFO,
               "time: %0.1f us/transform [total time=%0.2f s its=%d]\n",
               (double) duration / nb_its,
               (double) duration / 1000000.0,
               nb_its);
    }

    switch (transform) {
#if CONFIG_MDCT
    case TRANSFORM_MDCT:
        mdct_end(m);
        break;
#endif /* CONFIG_MDCT */
    case TRANSFORM_FFT:
        fft_end(s);
        break;
#if FFT_FLOAT
#    if CONFIG_RDFT
    case TRANSFORM_RDFT:
        rdft_end(r);
        break;
#    endif /* CONFIG_RDFT */
#    if CONFIG_DCT
    case TRANSFORM_DCT:
        dct_end(d);
        break;
#    endif /* CONFIG_DCT */
#endif /* FFT_FLOAT */
    }

cleanup:
    av_free(tab);
    av_free(tab1);
    av_free(tab2);
    av_free(tab_ref);
    av_free(exptab);

#if !AVFFT
    av_free(s);
    av_free(m);
#endif

#if !AVFFT && FFT_FLOAT
    av_free(r);
    av_free(d);
#endif

    if (err)
        printf("Error: %d.\n", err);

    return !!err;
}
FF_ENABLE_DEPRECATION_WARNINGS

View File

@ -5,11 +5,9 @@ OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_init.o
OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o
OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \
x86/dirac_dwt_init.o
OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert_init.o
OBJS-$(CONFIG_H263DSP) += x86/h263dsp_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
@ -98,8 +96,6 @@ X86ASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o \
X86ASM-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp.o
X86ASM-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp.o
X86ASM-OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp.o
X86ASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
X86ASM-OBJS-$(CONFIG_FFT) += x86/fft.o
X86ASM-OBJS-$(CONFIG_FMTCONVERT) += x86/fmtconvert.o
X86ASM-OBJS-$(CONFIG_H263DSP) += x86/h263_loopfilter.o
X86ASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \

View File

@ -1,36 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
/**
 * Install the x86 assembly version of the 32-point float DCT into the
 * context, preferring AVX over SSE2 when both are usable. The context is
 * left untouched when neither is available.
 */
av_cold void ff_dct_init_x86(DCTContext *s)
{
    const int flags = av_get_cpu_flags();

    if (EXTERNAL_AVX_FAST(flags))
        s->dct32 = ff_dct32_float_avx;
    else if (EXTERNAL_SSE2(flags))
        s->dct32 = ff_dct32_float_sse2;
}

View File

@ -1,838 +0,0 @@
;******************************************************************************
;* FFT transform with SSE/AVX optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
%include "libavutil/x86/x86util.asm"
; While the struc below is declared, "pointer" expands to a reservation
; directive of pointer width (resq on x86-64, resd otherwise).
%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif
; Field offsets used by the assembly to read members of the context.
; NOTE(review): presumably this has to mirror the start of the C FFTContext
; structure field for field - confirm against libavcodec/fft.h.
struc FFTContext
.nbits: resd 1
.reverse: resd 1
.revtab: pointer 1
.tmpbuf: pointer 1
.mdctsize: resd 1
.mdctbits: resd 1
.tcos: pointer 1
.tsin: pointer 1
.fftperm: pointer 1
.fftcalc: pointer 1
.imdctcalc:pointer 1
.imdcthalf:pointer 1
endstruc
; Constant tables, in a read-only section requested with alignment 32.
SECTION_RODATA 32
%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509
; twiddle factors loaded by fft16_avx
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
; entries of 1<<31 are sign-bit masks: xor-ing with them negates that lane
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
; index vectors for the vpermilps instructions in T8_AVX
perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1p1: dd 1<<31, 0
cextern ps_neg
; import the externally defined tables cos_16, cos_32, ... (14 of them,
; the suffix doubling each time)
%assign i 16
%rep 14
cextern cos_ %+ i
%assign i i<<1
%endrep
; From here on "pointer" emits pointer-sized data (dq/dd) rather than
; reserving space; it is used for the dispatch tables built by DECL_FFT.
%if ARCH_X86_64
%define pointer dq
%else
%define pointer dd
%endif
; Conditional-emit helpers, invoked as IF%1 with a 0/1 macro parameter:
; IF0 discards the rest of the line, IF1 assembles it unchanged.
%macro IF0 1+
%endmacro
%macro IF1 1+
%1
%endmacro
SECTION .text
; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
; %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
; Register-only transform step used by fft8_avx and fft16_avx.
; %1, %2 are both inputs and outputs; %3, %4, %5 are clobbered scratch.
; Constants are read from ps_p1p1m1p1root2, perm1, perm2 and
; ps_m1m1p1m1p1m1m1m1.
%macro T8_AVX 5
vsubps %5, %1, %2 ; v = %1 - %2
vaddps %3, %1, %2 ; w = %1 + %2
vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
vpermilps %2, %2, [perm1]
vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
vsubps %4, %5, %1 ; s = r - q
vaddps %1, %5, %1 ; u = r + q
vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
vshufps %5, %4, %1, 0xbb
vshufps %3, %4, %1, 0xee
vperm2f128 %3, %3, %5, 0x13
vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
vshufps %2, %1, %4, 0xdd
vshufps %1, %1, %4, 0x88
vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
vsubps %5, %1, %3
vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
vsubps %2, %4, %1 ; %2 = v - w
vaddps %1, %4, %1 ; %1 = v + w
%endmacro
; In SSE mode do one fft4 transforms
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
; %1, %2 are both inputs and outputs, %3 is clobbered scratch.
; Written in three-operand form, so the same text assembles for the SSE and
; the AVX instantiations.
%macro T4_SSE 3
subps %3, %1, %2 ; {t3,t4,-t8,t7}
addps %1, %1, %2 ; {t1,t2,t6,t5}
xorps %3, %3, [ps_p1p1m1p1] ; flip the sign of the third lane
shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
subps %3, %1, %2 ; {r2,i2,r3,i3}
addps %1, %1, %2 ; {r0,i0,r1,i1}
shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro
; In SSE mode do one FFT8
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
;
; %5 and %6 are clobbered scratch registers.
%macro T8_SSE 6
addps %6, %3, %4 ; {t1,t2,t3,t4}
subps %3, %3, %4 ; {r5,i5,r7,i7}
shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
mulps %4, %4, [ps_root2]
addps %3, %3, %4 ; {t8,t7,ta,t9}
shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
subps %3, %6, %4 ; {t6,t5,tc,tb}
addps %6, %6, %4 ; {t1,t2,t9,ta}
shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
subps %3, %1, %6 ; {r4,r5,r6,r7}
addps %1, %1, %6 ; {r0,r1,r2,r3}
subps %4, %2, %5 ; {i4,i5,i6,i7}
addps %2, %2, %5 ; {i0,i1,i2,i3}
%endmacro
; Interleave the elements of %2 with those of %1 and store the result.
; %1, %2: source registers (%2 is overwritten)
; %3: scratch register
; %4: name of the addressing macro to store through (Z or Z2)
; %5: slot index; slots %5 and %5+1 are written
%macro INTERL 5
%if cpuflag(avx)
vunpckhps %3, %2, %1
vunpcklps %2, %2, %1
; the unpacks work per 128-bit lane, so the halves are stored separately
vextractf128 %4(%5), %2, 0
vextractf128 %4 %+ H(%5), %3, 0
vextractf128 %4(%5 + 1), %2, 1
vextractf128 %4 %+ H(%5 + 1), %3, 1
%elif cpuflag(sse)
mova %3, %2
unpcklps %2, %1
unpckhps %3, %1
mova %4(%5), %2
mova %4(%5+1), %3
%endif
%endmacro
; scheduled for cpu-bound sizes
; Combination pass over the slots Z(0)..Z(5), Z2(6), Z2(7).
; %1: 1 = load m4-m7 from Z(4), Z(5), Z2(6), Z2(7) first;
;     0 = the caller already holds those values in m4-m7
; %2: memory operand holding wre
; %3: memory operand holding wim
; Clobbers m0-m7; all eight slots are written.
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova m4, Z(4)
IF%1 mova m5, Z(5)
mova m0, %2 ; wre
mova m1, %3 ; wim
mulps m2, m4, m0 ; r2*wre
IF%1 mova m6, Z2(6)
mulps m3, m5, m1 ; i2*wim
IF%1 mova m7, Z2(7)
mulps m4, m4, m1 ; r2*wim
mulps m5, m5, m0 ; i2*wre
addps m2, m2, m3 ; r2*wre + i2*wim
mulps m3, m1, m7 ; i3*wim
subps m5, m5, m4 ; i2*wre - r2*wim
mulps m1, m1, m6 ; r3*wim
mulps m4, m0, m6 ; r3*wre
mulps m0, m0, m7 ; i3*wre
subps m4, m4, m3 ; r3*wre - i3*wim
mova m3, Z(0)
addps m0, m0, m1 ; i3*wre + r3*wim
subps m1, m4, m2 ; t3
addps m4, m4, m2 ; t5
subps m3, m3, m4 ; r2
addps m4, m4, Z(0) ; r0
mova m6, Z(2)
mova Z(4), m3
mova Z(0), m4
subps m3, m5, m0 ; t4
subps m4, m6, m3 ; r3
addps m3, m3, m6 ; r1
mova Z2(6), m4
mova Z(2), m3
mova m2, Z(3)
addps m3, m5, m0 ; t6
subps m2, m2, m1 ; i3
mova m7, Z(1)
addps m1, m1, Z(3) ; i1
mova Z2(7), m2
mova Z(3), m1
subps m4, m7, m3 ; i2
addps m3, m3, m7 ; i0
mova Z(5), m4
mova Z(1), m3
%endmacro
; scheduled to avoid store->load aliasing
; Same combination pass as PASS_SMALL, with the twiddles read from [wq]
; (wre) and [wq+o1q] (wim) and the inputs always loaded from memory.
; %1: 1 = store the results as computed;
;     0 = store them interleaved through INTERL instead
; Clobbers m0-m7.
%macro PASS_BIG 1 ; (!interleave)
mova m4, Z(4) ; r2
mova m5, Z(5) ; i2
mova m0, [wq] ; wre
mova m1, [wq+o1q] ; wim
mulps m2, m4, m0 ; r2*wre
mova m6, Z2(6) ; r3
mulps m3, m5, m1 ; i2*wim
mova m7, Z2(7) ; i3
mulps m4, m4, m1 ; r2*wim
mulps m5, m5, m0 ; i2*wre
addps m2, m2, m3 ; r2*wre + i2*wim
mulps m3, m1, m7 ; i3*wim
mulps m1, m1, m6 ; r3*wim
subps m5, m5, m4 ; i2*wre - r2*wim
mulps m4, m0, m6 ; r3*wre
mulps m0, m0, m7 ; i3*wre
subps m4, m4, m3 ; r3*wre - i3*wim
mova m3, Z(0)
addps m0, m0, m1 ; i3*wre + r3*wim
subps m1, m4, m2 ; t3
addps m4, m4, m2 ; t5
subps m3, m3, m4 ; r2
addps m4, m4, Z(0) ; r0
mova m6, Z(2)
mova Z(4), m3
mova Z(0), m4
subps m3, m5, m0 ; t4
subps m4, m6, m3 ; r3
addps m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova Z(2), m3
mova m2, Z(3)
addps m5, m5, m0 ; t6
subps m2, m2, m1 ; i3
mova m7, Z(1)
addps m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova Z(3), m1
subps m6, m7, m5 ; i2
addps m5, m5, m7 ; i0
IF%1 mova Z(5), m6
IF%1 mova Z(1), m5
%if %1==0
; interleaving variant: the values still held in registers are paired up
; and stored through INTERL; Z(0) and Z(4) are reloaded first because
; they were already stored above
INTERL m1, m3, m7, Z, 2
INTERL m2, m4, m0, Z2, 6
mova m1, Z(0)
mova m2, Z(4)
INTERL m5, m1, m3, Z, 0
INTERL m6, m2, m7, Z, 4
%endif
%endmacro
%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]
INIT_YMM avx
%if HAVE_AVX_EXTERNAL
; Internal routine: 8-point transform, in place on the two ymm-sized slots
; at r0. Clobbers m0-m4.
align 16
fft8_avx:
mova m0, Z(0)
mova m1, Z(1)
T8_AVX m0, m1, m2, m3, m4
mova Z(0), m0
mova Z(1), m1
ret
; Internal routine: 16-point transform, in place on the four ymm-sized
; slots at r0. Slots 2-3 go through T4_SSE, slots 0-1 through T8_AVX, and
; the two halves are then combined with the ps_cos16_1/ps_cos16_2 twiddles.
; Clobbers m0-m7.
align 16
fft16_avx:
mova m2, Z(2)
mova m3, Z(3)
T4_SSE m2, m3, m7
mova m0, Z(0)
mova m1, Z(1)
T8_AVX m0, m1, m4, m5, m7
mova m4, [ps_cos16_1]
mova m5, [ps_cos16_2]
vmulps m6, m2, m4
vmulps m7, m3, m5
vaddps m7, m7, m6
vmulps m2, m2, m5
vmulps m3, m3, m4
vsubps m3, m3, m2
vblendps m2, m7, m3, 0xf0
vperm2f128 m3, m7, m3, 0x21
vaddps m4, m2, m3
vsubps m2, m3, m2
vperm2f128 m2, m2, m2, 0x01
vsubps m3, m1, m2
vaddps m1, m1, m2
vsubps m5, m0, m4
vaddps m0, m0, m4
; store 128 bits at a time: low half of each slot from one register,
; high half (ZH) from the other
vextractf128 Z(0), m0, 0
vextractf128 ZH(0), m1, 0
vextractf128 Z(1), m0, 1
vextractf128 ZH(1), m1, 1
vextractf128 Z(2), m5, 0
vextractf128 ZH(2), m3, 0
vextractf128 Z(3), m5, 1
vextractf128 ZH(3), m3, 1
ret
; Internal routine: 32-point transform, in place on the eight ymm-sized
; slots at r0. fft16_avx handles slots 0-3; slots 4-7 are transformed in
; registers and merged by PASS_SMALL with the cos_32 table.
align 16
fft32_avx:
call fft16_avx
mova m0, Z(4)
mova m1, Z(5)
T4_SSE m0, m1, m4
mova m2, Z(6)
mova m3, Z(7)
T8_SSE m0, m1, m2, m3, m4, m6
; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
vperm2f128 m4, m0, m2, 0x20
vperm2f128 m5, m1, m3, 0x20
vperm2f128 m6, m0, m2, 0x31
vperm2f128 m7, m1, m3, 0x31
; first argument 0: m4-m7 are already loaded, PASS_SMALL must not reload them
PASS_SMALL 0, [cos_32], [cos_32+32]
ret
; Internal routine: fft32_avx followed by a pass that interleaves each pair
; of slots element by element. r2d starts at 32 and drops by mmsize/4 per
; iteration while r0 advances by two slots; r0 is not restored on return.
fft32_interleave_avx:
call fft32_avx
mov r2d, 32
.deint_loop:
mova m2, Z(0)
mova m3, Z(1)
vunpcklps m0, m2, m3
vunpckhps m1, m2, m3
vextractf128 Z(0), m0, 0
vextractf128 ZH(0), m1, 0
vextractf128 Z(1), m0, 1
vextractf128 ZH(1), m1, 1
add r0, mmsize*2
sub r2d, mmsize/4
jg .deint_loop
ret
%endif
INIT_XMM sse
; Internal routine: 4-point transform, in place on the two xmm-sized slots
; at r0. The fft4_avx label shares this SSE code, so it can serve as the
; first entry of both dispatch tables.
align 16
fft4_avx:
fft4_sse:
mova m0, Z(0)
mova m1, Z(1)
T4_SSE m0, m1, m2
mova Z(0), m0
mova Z(1), m1
ret
; Internal routine: 8-point transform, in place on the four xmm-sized slots
; at r0. Clobbers m0-m5.
align 16
fft8_sse:
mova m0, Z(0)
mova m1, Z(1)
T4_SSE m0, m1, m2
mova m2, Z(2)
mova m3, Z(3)
T8_SSE m0, m1, m2, m3, m4, m5
mova Z(0), m0
mova Z(1), m1
mova Z(2), m2
mova Z(3), m3
ret
; Internal routine: 16-point transform, in place on the eight xmm-sized
; slots at r0. Slots 0-3 form an 8-point transform, slots 4-5 and 6-7 two
; 4-point transforms; PASS_SMALL merges them with the cos_16 table.
align 16
fft16_sse:
mova m0, Z(0)
mova m1, Z(1)
T4_SSE m0, m1, m2
mova m2, Z(2)
mova m3, Z(3)
T8_SSE m0, m1, m2, m3, m4, m5
mova m4, Z(4)
mova m5, Z(5)
mova Z(0), m0
mova Z(1), m1
mova Z(2), m2
mova Z(3), m3
T4_SSE m4, m5, m6
mova m6, Z2(6)
mova m7, Z2(7)
T4_SSE m6, m7, m0
; first argument 0: m4-m7 are already loaded, PASS_SMALL must not reload them
PASS_SMALL 0, [cos_16], [cos_16+16]
ret
%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
; Emit a loop routine named %1 that runs the payload %2 repeatedly.
; Register arguments: zc (data pointer), w (twiddle pointer), n (count).
; o1 is set to n*8 and o3 to n*48; both serve as byte offsets in the
; Z()/Z2() addressing macros. Each iteration advances zc by two vectors and
; w by one, and lowers n by mmsize/8 until it is no longer positive.
%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
lea o3q, [nq*3]
lea o1q, [nq*8]
shl o3q, 4
.loop:
%2
add zcq, mmsize*2
add wq, mmsize
sub nd, mmsize/8
jg .loop
rep ret
%endmacro
; Call the transform routine for size exponent %2 through the table
; dispatch_tab%1, whose first entry belongs to exponent 2.
; %1: table suffix, %2: register holding the exponent.
; In PIC builds the table holds section-relative offsets, so the section
; start ($$) is added back before the call.
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
lea r2, [dispatch_tab%1]
mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
lea r3, [$$]
add r2, r3
%endif
call r2
%endmacro ; FFT_DISPATCH
INIT_YMM avx
%if HAVE_AVX_EXTERNAL
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0
; fft_calc(FFTContext *s, FFTComplex *z), AVX version.
; Moves z into r0 and s->nbits into r1, then dispatches to the interleaving
; variant of the size-specific routine.
cglobal fft_calc, 2,5,8
mov r3d, [r0 + FFTContext.nbits]
mov r0, r1
mov r1, r3
FFT_DISPATCH _interleave %+ SUFFIX, r1
RET
%endif
INIT_XMM sse
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0
INIT_XMM sse
; fft_calc(FFTContext *s, FFTComplex *z), SSE version.
; Dispatches like the AVX version, but keeps z and nbits on the stack
; across the call: for nbits <= 3 + mmsize/16 the result is interleaved
; here, pair of vectors by pair of vectors, after the transform returns.
cglobal fft_calc, 2,5,8
mov r3d, [r0 + FFTContext.nbits]
PUSH r1
PUSH r3
mov r0, r1
mov r1, r3
FFT_DISPATCH _interleave %+ SUFFIX, r1
; popped in reverse order: rcx = nbits, r4 = z
POP rcx
POP r4
cmp rcx, 3+(mmsize/16)
jg .end
; r2 = -(8 << nbits), the negated buffer size in bytes; r4 is moved to the
; end of the buffer so that r2 can count up towards zero
mov r2, -1
add rcx, 3
shl r2, cl
sub r4, r2
.loop:
movaps xmm0, [r4 + r2]
movaps xmm1, xmm0
unpcklps xmm0, [r4 + r2 + 16]
unpckhps xmm1, [r4 + r2 + 16]
movaps [r4 + r2], xmm0
movaps [r4 + r2 + 16], xmm1
add r2, mmsize*2
jl .loop
.end:
RET
; fft_permute(FFTContext *s, FFTComplex *z)
; Scatters the 1 << nbits complex values of z into s->tmpbuf at the
; positions given by the 16-bit entries of s->revtab (two values per
; iteration), then copies tmpbuf back over z.
cglobal fft_permute, 2,7,1
mov r4, [r0 + FFTContext.revtab]
mov r5, [r0 + FFTContext.tmpbuf]
mov ecx, [r0 + FFTContext.nbits]
mov r2, 1
shl r2, cl
xor r0, r0
%if ARCH_X86_32
mov r1, r1m
%endif
.loop:
movaps xmm0, [r1 + 8*r0]
movzx r6, word [r4 + 2*r0]
movzx r3, word [r4 + 2*r0 + 2]
movlps [r5 + 8*r6], xmm0
movhps [r5 + 8*r3], xmm0
add r0, 2
cmp r0, r2
jl .loop
; r2 becomes the size in bytes; both pointers are moved to the end of
; their buffers and r2 is negated so the copy loop counts up towards zero
shl r2, 3
add r1, r2
add r5, r2
neg r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
movaps xmm0, [r5 + r2]
movaps xmm1, [r5 + r2 + 16]
movaps [r1 + r2], xmm0
movaps [r1 + r2 + 16], xmm1
add r2, 32
jl .loopcopy
RET
INIT_XMM sse
; imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
; Calls the routine stored in s->imdcthalf with the output pointer advanced
; by mdctsize bytes, then fills the rest of the output from that result:
; vectors are read from both ends, reversed element-wise (shufps 0x1b) and
; written to the opposite side, one side additionally xor-ed with ps_neg.
; NOTE(review): ps_neg is external; presumably a vector of sign-bit masks -
; confirm at its definition.
cglobal imdct_calc, 3,5,3
mov r3d, [r0 + FFTContext.mdctsize]
mov r4, [r0 + FFTContext.imdcthalf]
add r1, r3
; r3 (mdctsize) and r1 (advanced output) are needed again after the call
PUSH r3
PUSH r1
%if ARCH_X86_32
; stack-based calling convention: pass the three arguments explicitly
push r2
push r1
push r0
%else
sub rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
call r4
%if ARCH_X86_32
add esp, 12
%else
add rsp, 8+32*WIN64
%endif
POP r1
POP r3
lea r0, [r1 + 2*r3]
mov r2, r3
sub r3, mmsize
neg r2
mova m2, [ps_neg]
.loop:
mova m0, [r1 + r3]
mova m1, [r0 + r2]
shufps m0, m0, 0x1b
shufps m1, m1, 0x1b
xorps m0, m2
mova [r0 + r3], m1
mova [r1 + r2], m0
sub r3, mmsize
add r2, mmsize
jl .loop
RET
%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif
; Generate the size-specific routines fft<n><suffix> for n = 1 << %1 and
; every following power of two (18 - %1 sizes in total), plus the table
; dispatch_tab<suffix> that FFT_DISPATCH indexes.
; %1: exponent of the first size to generate; smaller sizes are taken from
;     the hand-written fft4/fft8/fft16/fft32 routines
; %2: optional extra suffix (e.g. _interleave) placed before SUFFIX
; Each generated routine calls the half-size routine once and the
; quarter-size routine twice, then tail-jumps into pass<suffix> with
; r1 = cos_<n> and r2d = n/8.
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif
%assign n 1<<%1
%rep 18-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
align 16
fft %+ n %+ fullsuffix:
call fft %+ n2 %+ SUFFIX
add r0, n*4 - (n&(-2<<%1))
call fft %+ n4 %+ SUFFIX
add r0, n*2 - (n2&(-2<<%1))
call fft %+ n4 %+ SUFFIX
sub r0, n*6 + (n2&(-2<<%1))
lea r1, [cos_ %+ n]
mov r2d, n4/2
jmp pass %+ fullsuffix
%assign n n*2
%endrep
%undef n
align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps
; Pre-rotation step of imdct_half: loads input values from the offsets %2
; and %1 around %3, multiplies them with the cos/sin entries at the same
; offsets around %4/%5 and leaves the two result vectors in xmm0 and xmm1.
; Clobbers xmm0-xmm5.
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
movaps xmm0, [%3+%2*4]
movaps xmm1, [%3+%1*4-0x10]
movaps xmm2, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm1, xmm2, 0x77
movlps xmm4, [%4+%2*2]
movlps xmm5, [%5+%2*2+0x0]
movhps xmm4, [%4+%1*2-0x8]
movhps xmm5, [%5+%1*2-0x8]
movaps xmm2, xmm0
movaps xmm3, xmm1
mulps xmm0, xmm5
mulps xmm1, xmm4
mulps xmm2, xmm4
mulps xmm3, xmm5
subps xmm1, xmm0
addps xmm2, xmm3
movaps xmm0, xmm1
unpcklps xmm1, xmm2
unpckhps xmm0, xmm2
%endmacro
; Rotate the pair (%2, %3) by the table entries at offset %1:
; %2 = %2*[%6+%1] - %3*[%5+%1]
; %3 = %3*[%6+%1] + %2*[%5+%1] (using the incoming %2)
; %4 is accepted but not used. Clobbers m6 and m7.
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
mulps m6, %3, [%5+%1]
mulps m7, %2, [%5+%1]
mulps %2, %2, [%6+%1]
mulps %3, %3, [%6+%1]
subps %2, %2, m6
addps %3, %3, m7
%endmacro
; Post-rotation loop of imdct_half. Works from both ends towards the
; middle: %1 starts negative and counts up, %2 counts down; the loop ends
; once %1 is no longer negative. Each iteration rotates one block at offset
; %1 and one at offset %2 with CMUL, reverses one vector of each and stores
; the results interleaved back in place.
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
%if cpuflag(avx)
vmovaps ymm1, [%3+%1*2]
vmovaps ymm0, [%3+%1*2+0x20]
vmovaps ymm3, [%3+%2*2]
vmovaps ymm2, [%3+%2*2+0x20]
CMUL %1, ymm0, ymm1, %3, %4, %5
CMUL %2, ymm2, ymm3, %3, %4, %5
; full 8-element reversal: reverse inside each lane, then swap the lanes
vshufps ymm1, ymm1, ymm1, 0x1b
vshufps ymm3, ymm3, ymm3, 0x1b
vperm2f128 ymm1, ymm1, ymm1, 0x01
vperm2f128 ymm3, ymm3, ymm3, 0x01
vunpcklps ymm6, ymm2, ymm1
vunpckhps ymm4, ymm2, ymm1
vunpcklps ymm7, ymm0, ymm3
vunpckhps ymm5, ymm0, ymm3
vextractf128 [%3+%1*2], ymm7, 0
vextractf128 [%3+%1*2+0x10], ymm5, 0
vextractf128 [%3+%1*2+0x20], ymm7, 1
vextractf128 [%3+%1*2+0x30], ymm5, 1
vextractf128 [%3+%2*2], ymm6, 0
vextractf128 [%3+%2*2+0x10], ymm4, 0
vextractf128 [%3+%2*2+0x20], ymm6, 1
vextractf128 [%3+%2*2+0x30], ymm4, 1
sub %2, 0x20
add %1, 0x20
jl .post
%else
movaps xmm1, [%3+%1*2]
movaps xmm0, [%3+%1*2+0x10]
CMUL %1, xmm0, xmm1, %3, %4, %5
movaps xmm5, [%3+%2*2]
movaps xmm4, [%3+%2*2+0x10]
CMUL %2, xmm4, xmm5, %3, %4, %5
shufps xmm1, xmm1, 0x1b
shufps xmm5, xmm5, 0x1b
movaps xmm6, xmm4
unpckhps xmm4, xmm1
unpcklps xmm6, xmm1
movaps xmm2, xmm0
unpcklps xmm0, xmm5
unpckhps xmm2, xmm5
movaps [%3+%2*2], xmm6
movaps [%3+%2*2+0x10], xmm4
movaps [%3+%1*2], xmm0
movaps [%3+%1*2+0x10], xmm2
sub %2, 0x10
add %1, 0x10
jl .post
%endif
%endmacro
; Emit imdct_half for the currently selected instruction set.
; Three stages: a pre-rotation loop (PREROTATER) that scatters its results
; into the output at the positions given by revtab, an in-place FFT of the
; output through FFT_DISPATCH, and the post-rotation loop (POSROTATESHUF).
; On x86-32 rrevtab and rtsin share r6, so tcos, tsin and revtab are kept
; on the stack (12 bytes, released before RET) and reloaded when needed.
%macro DECL_IMDCT 0
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos r8
%define rtsin r9
%else
%define rrevtab r6
%define rtsin r6
%define rtcos r5
%endif
mov r3d, [r0+FFTContext.mdctsize]
add r2, r3
shr r3, 1
mov rtcos, [r0+FFTContext.tcos]
mov rtsin, [r0+FFTContext.tsin]
add rtcos, r3
add rtsin, r3
%if ARCH_X86_64 == 0
push rtcos
push rtsin
%endif
shr r3, 1
mov rrevtab, [r0+FFTContext.revtab]
add rrevtab, r3
%if ARCH_X86_64 == 0
push rrevtab
%endif
sub r3, 4
%if ARCH_X86_64
xor r4, r4
sub r4, r3
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
xor r4, r4
sub r4, r3
mov rtcos, [esp+8]
mov rtsin, [esp+4]
%endif
PREROTATER r4, r3, r2, rtcos, rtsin
%if ARCH_X86_64
movzx r5, word [rrevtab+r4-4]
movzx r6, word [rrevtab+r4-2]
movzx r10, word [rrevtab+r3]
movzx r11, word [rrevtab+r3+2]
movlps [r1+r5 *8], xmm0
movhps [r1+r6 *8], xmm0
movlps [r1+r10*8], xmm1
movhps [r1+r11*8], xmm1
add r4, 4
%else
mov r6, [esp]
movzx r5, word [r6+r4-4]
movzx r4, word [r6+r4-2]
movlps [r1+r5*8], xmm0
movhps [r1+r4*8], xmm0
movzx r5, word [r6+r3]
movzx r4, word [r6+r3+2]
movlps [r1+r5*8], xmm1
movhps [r1+r4*8], xmm1
%endif
sub r3, 4
jns .pre
; keep the context in r5 and the output in r6 across the FFT, which takes
; the buffer in r0 and the size exponent in r1
mov r5, r0
mov r6, r1
mov r0, r1
mov r1d, [r5+FFTContext.nbits]
FFT_DISPATCH SUFFIX, r1
mov r0d, [r5+FFTContext.mdctsize]
add r6, r0
shr r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
mov rtcos, [esp+8]
mov rtsin, [esp+4]
%endif
neg r0
mov r1, -mmsize
sub r1, r0
POSROTATESHUF r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
add esp, 12
%endif
RET
%endmacro
DECL_IMDCT
INIT_YMM avx
%if HAVE_AVX_EXTERNAL
DECL_IMDCT
%endif

View File

@ -1,32 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_X86_FFT_H
#define AVCODEC_X86_FFT_H
#include "libavcodec/fft.h"
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
#endif /* AVCODEC_X86_FFT_H */

View File

@ -1,47 +0,0 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "fft.h"
/**
 * Install the x86 assembly routines into an FFT context.
 *
 * Nothing is changed for transforms with more than 16 bits. SSE provides
 * the complete set of routines; AVX, usable from 5 bits on, then replaces
 * imdct_half and fft_calc and selects its own permutation type.
 */
av_cold void ff_fft_init_x86(FFTContext *s)
{
    const int flags = av_get_cpu_flags();
    const int nbits = s->nbits;

    if (nbits <= 16) {
        if (EXTERNAL_SSE(flags)) {
            s->imdct_calc      = ff_imdct_calc_sse;
            s->imdct_half      = ff_imdct_half_sse;
            s->fft_permute     = ff_fft_permute_sse;
            s->fft_calc        = ff_fft_calc_sse;
            s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
        }

        /* checked after SSE so that the AVX routines take precedence */
        if (EXTERNAL_AVX_FAST(flags) && nbits >= 5) {
            s->imdct_half      = ff_imdct_half_avx;
            s->fft_calc        = ff_fft_calc_avx;
            s->fft_permutation = FF_FFT_PERM_AVX;
        }
    }
}

View File

@ -181,7 +181,6 @@ include $(SRC_PATH)/tests/fate/enc_external.mak
# Must be included after lavf-video.mak
include $(SRC_PATH)/tests/fate/ffmpeg.mak
include $(SRC_PATH)/tests/fate/ffprobe.mak
include $(SRC_PATH)/tests/fate/fft.mak
include $(SRC_PATH)/tests/fate/fifo-muxer.mak
include $(SRC_PATH)/tests/fate/filter-audio.mak
# Must be included after vcodec.mak

View File

@ -1,83 +0,0 @@
# DEF_FFT: per-size template for the tests driven by libavcodec/tests/fft.
# Instantiate it with $(eval $(call DEF_FFT,<n>)).
#
#   $(1)  size argument; it is appended to every target name and handed
#         to the test program as -n$(1).
#
# Each group of targets is appended to a FATE_<X>-$(CONFIG_<X>) list, so
# it only lands in the -yes list when CONFIG_<X> expands to "yes".
# The target-specific ARGS values are consumed by the shared CMD line.
# NOTE(review): judging by the target names, -i selects the inverse
# transform and -m / -r / -d select MDCT / RDFT / DCT; confirm against
# the test program's option parsing.
#
# Every target name is built from $(1), so the template does not depend
# on the name of the caller's loop variable.
define DEF_FFT
FATE_DCT-$(CONFIG_DCT) += fate-dct1d-$(1) fate-idct1d-$(1)
FATE_FFT-$(CONFIG_FFT) += fate-fft-$(1) fate-ifft-$(1)
FATE_MDCT-$(CONFIG_MDCT) += fate-mdct-$(1) fate-imdct-$(1)
FATE_RDFT-$(CONFIG_RDFT) += fate-rdft-$(1) fate-irdft-$(1)
fate-fft-$(1): ARGS = -n$(1)
fate-ifft-$(1): ARGS = -n$(1) -i
fate-mdct-$(1): ARGS = -n$(1) -m
fate-imdct-$(1): ARGS = -n$(1) -m -i
fate-rdft-$(1): ARGS = -n$(1) -r
fate-irdft-$(1): ARGS = -n$(1) -r -i
fate-dct1d-$(1): ARGS = -n$(1) -d
fate-idct1d-$(1): ARGS = -n$(1) -d -i
endef
# Instantiate DEF_FFT once for every size from 4 to 12.
$(foreach N, 4 5 6 7 8 9 10 11 12, $(eval $(call DEF_FFT,$(N))))
# Per-transform convenience targets; each depends on the tests that the
# template added to the matching -yes list.
fate-dct-float: $(FATE_DCT-yes)
fate-fft-float: $(FATE_FFT-yes)
fate-mdct-float: $(FATE_MDCT-yes)
fate-rdft-float: $(FATE_RDFT-yes)
# Union of all enabled float tests. Recursively expanded (=), so the
# list is re-evaluated each time it is referenced.
FATE_FFT_ALL = $(FATE_DCT-yes) $(FATE_FFT-yes) $(FATE_MDCT-yes) $(FATE_RDFT-yes)
# All of these tests need the fft test binary to be built first.
$(FATE_FFT_ALL): libavcodec/tests/fft$(EXESUF)
# Shared command line: every word of CPUFLAGS becomes a -c<word> option,
# followed by the per-target ARGS set by the template.
$(FATE_FFT_ALL): CMD = run libavcodec/tests/fft$(EXESUF) $(CPUFLAGS:%=-c%) $(ARGS)
# NOTE(review): CMP = null presumably disables comparison against a
# reference file -- confirm in the FATE runner.
$(FATE_FFT_ALL): CMP = null
# DEF_FFT_FIXED32: per-size template for the tests driven by
# libavcodec/tests/fft-fixed32. Instantiate it with
# $(eval $(call DEF_FFT_FIXED32,<n>)).
#
#   $(1)  size argument; it is appended to every target name and handed
#         to the test program as -n$(1).
#
# Four targets per size are appended to FATE_FFT_FIXED32.
# NOTE(review): fate-mdct-fixed32-<n> is listed, but its ARGS line is
# commented out, so that target gets no target-specific ARGS from this
# template. Confirm whether that is intentional.
define DEF_FFT_FIXED32
FATE_FFT_FIXED32 += fate-fft-fixed32-$(1) fate-ifft-fixed32-$(1)
FATE_FFT_FIXED32 += fate-mdct-fixed32-$(1) fate-imdct-fixed32-$(1)
fate-fft-fixed32-$(1): ARGS = -n$(1)
fate-ifft-fixed32-$(1): ARGS = -n$(1) -i
#fate-mdct-fixed32-$(1): ARGS = -n$(1) -m
fate-imdct-fixed32-$(1): ARGS = -n$(1) -m -i
endef
# Instantiate DEF_FFT_FIXED32 once for every size from 4 to 12.
$(foreach N, 4 5 6 7 8 9 10 11 12, $(eval $(call DEF_FFT_FIXED32,$(N))))
# Convenience target that runs every fixed32 test.
fate-fft-fixed32: $(FATE_FFT_FIXED32)
# All of these tests need the fft-fixed32 test binary to be built first.
$(FATE_FFT_FIXED32): libavcodec/tests/fft-fixed32$(EXESUF)
# Shared command line: every word of CPUFLAGS becomes a -c<word> option,
# followed by the per-target ARGS set by the template.
$(FATE_FFT_FIXED32): CMD = run libavcodec/tests/fft-fixed32$(EXESUF) $(CPUFLAGS:%=-c%) $(ARGS)
# NOTE(review): CMP = null presumably disables comparison against a
# reference file -- confirm in the FATE runner.
$(FATE_FFT_FIXED32): CMP = null
# DEF_AV_FFT: per-size template for the tests driven by
# libavcodec/tests/avfft. Instantiate it with
# $(eval $(call DEF_AV_FFT,<n>)).
#
#   $(1)  size argument; it is appended to every target name and handed
#         to the test program as -n$(1).
#
# Each group of targets is appended to a FATE_AV_<X>-$(CONFIG_<X>) list,
# so it only lands in the -yes list when CONFIG_<X> expands to "yes".
# The target-specific ARGS values are consumed by the shared CMD line.
# NOTE(review): judging by the target names, -i selects the inverse
# transform and -m / -r / -d select MDCT / RDFT / DCT; confirm against
# the test program's option parsing.
#
# Every target name is built from $(1), so the template does not depend
# on the name of the caller's loop variable.
define DEF_AV_FFT
FATE_AV_DCT-$(CONFIG_DCT) += fate-av-dct1d-$(1) fate-av-idct1d-$(1)
FATE_AV_FFT-$(CONFIG_FFT) += fate-av-fft-$(1) fate-av-ifft-$(1)
FATE_AV_MDCT-$(CONFIG_MDCT) += fate-av-mdct-$(1) fate-av-imdct-$(1)
FATE_AV_RDFT-$(CONFIG_RDFT) += fate-av-rdft-$(1) fate-av-irdft-$(1)
fate-av-fft-$(1): ARGS = -n$(1)
fate-av-ifft-$(1): ARGS = -n$(1) -i
fate-av-mdct-$(1): ARGS = -n$(1) -m
fate-av-imdct-$(1): ARGS = -n$(1) -m -i
fate-av-rdft-$(1): ARGS = -n$(1) -r
fate-av-irdft-$(1): ARGS = -n$(1) -r -i
fate-av-dct1d-$(1): ARGS = -n$(1) -d
fate-av-idct1d-$(1): ARGS = -n$(1) -d -i
endef
# Instantiate DEF_AV_FFT once for every size from 4 to 12.
$(foreach N, 4 5 6 7 8 9 10 11 12, $(eval $(call DEF_AV_FFT,$(N))))
# Per-transform convenience targets; each depends on the tests that the
# template added to the matching -yes list.
fate-av-dct-float: $(FATE_AV_DCT-yes)
fate-av-fft-float: $(FATE_AV_FFT-yes)
fate-av-mdct-float: $(FATE_AV_MDCT-yes)
fate-av-rdft-float: $(FATE_AV_RDFT-yes)
# Union of all enabled avfft tests. Recursively expanded (=), so the
# list is re-evaluated each time it is referenced.
FATE_AV_FFT_ALL = $(FATE_AV_DCT-yes) $(FATE_AV_FFT-yes) $(FATE_AV_MDCT-yes) $(FATE_AV_RDFT-yes)
# All of these tests need the avfft test binary to be built first.
$(FATE_AV_FFT_ALL): libavcodec/tests/avfft$(EXESUF)
# Shared command line: every word of CPUFLAGS becomes a -c<word> option,
# followed by the per-target ARGS set by the template.
$(FATE_AV_FFT_ALL): CMD = run libavcodec/tests/avfft$(EXESUF) $(CPUFLAGS:%=-c%) $(ARGS)
# NOTE(review): CMP = null presumably disables comparison against a
# reference file -- confirm in the FATE runner.
$(FATE_AV_FFT_ALL): CMP = null
# Short per-transform umbrella targets. Only fate-fft also pulls in the
# fixed32 tests; the fate-av-*-float targets are not part of these.
fate-dct: fate-dct-float
fate-fft: fate-fft-float fate-fft-fixed32
fate-mdct: fate-mdct-float
fate-rdft: fate-rdft-float
# Register all three test groups with the global FATE list.
# NOTE(review): ALLYES is defined outside this file; presumably it yields
# "yes" only when AVCODEC, FFT and MDCT are all enabled -- confirm.
FATE-$(call ALLYES, AVCODEC FFT MDCT) += $(FATE_FFT_ALL) $(FATE_FFT_FIXED32) $(FATE_AV_FFT_ALL)
# Runs every test defined in this file, whatever the ALLYES gate says.
fate-fft-all: $(FATE_FFT_ALL) $(FATE_FFT_FIXED32) $(FATE_AV_FFT_ALL)