avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for HEVC uni mc epel functions

This patch adds MSA (MIPS-SIMD-Arch) optimizations for HEVC uni mc epel functions.
Adds new generic macros (needed for this patch) in libavutil/mips/generic_macros_msa.h

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
Shivraj Patil 2015-06-02 14:08:12 +05:30 committed by Michael Niedermayer
parent c96c73b0b0
commit aef34ab950
4 changed files with 2286 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@ -129,6 +129,36 @@ static av_cold void hevc_dsp_init_msa(HEVCDSPContext *c,
c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_8_msa;
c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_8_msa;
c->put_hevc_epel_uni[3][0][0] = ff_hevc_put_hevc_uni_pel_pixels8_8_msa;
c->put_hevc_epel_uni[4][0][0] = ff_hevc_put_hevc_uni_pel_pixels12_8_msa;
c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels16_8_msa;
c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels24_8_msa;
c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_msa;
c->put_hevc_epel_uni[1][0][1] = ff_hevc_put_hevc_uni_epel_h4_8_msa;
c->put_hevc_epel_uni[2][0][1] = ff_hevc_put_hevc_uni_epel_h6_8_msa;
c->put_hevc_epel_uni[3][0][1] = ff_hevc_put_hevc_uni_epel_h8_8_msa;
c->put_hevc_epel_uni[4][0][1] = ff_hevc_put_hevc_uni_epel_h12_8_msa;
c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_8_msa;
c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_8_msa;
c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_msa;
c->put_hevc_epel_uni[1][1][0] = ff_hevc_put_hevc_uni_epel_v4_8_msa;
c->put_hevc_epel_uni[2][1][0] = ff_hevc_put_hevc_uni_epel_v6_8_msa;
c->put_hevc_epel_uni[3][1][0] = ff_hevc_put_hevc_uni_epel_v8_8_msa;
c->put_hevc_epel_uni[4][1][0] = ff_hevc_put_hevc_uni_epel_v12_8_msa;
c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_8_msa;
c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_8_msa;
c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_msa;
c->put_hevc_epel_uni[1][1][1] = ff_hevc_put_hevc_uni_epel_hv4_8_msa;
c->put_hevc_epel_uni[2][1][1] = ff_hevc_put_hevc_uni_epel_hv6_8_msa;
c->put_hevc_epel_uni[3][1][1] = ff_hevc_put_hevc_uni_epel_hv8_8_msa;
c->put_hevc_epel_uni[4][1][1] = ff_hevc_put_hevc_uni_epel_hv12_8_msa;
c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_8_msa;
c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_msa;
c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_msa;
c->put_hevc_qpel_uni_w[1][0][0] =
ff_hevc_put_hevc_uni_w_pel_pixels4_8_msa;
c->put_hevc_qpel_uni_w[3][0][0] =

View File

@ -145,6 +145,36 @@ UNI_MC(qpel, hv, 32);
UNI_MC(qpel, hv, 48);
UNI_MC(qpel, hv, 64);
UNI_MC(epel, h, 4);
UNI_MC(epel, h, 6);
UNI_MC(epel, h, 8);
UNI_MC(epel, h, 12);
UNI_MC(epel, h, 16);
UNI_MC(epel, h, 24);
UNI_MC(epel, h, 32);
UNI_MC(epel, h, 48);
UNI_MC(epel, h, 64);
UNI_MC(epel, v, 4);
UNI_MC(epel, v, 6);
UNI_MC(epel, v, 8);
UNI_MC(epel, v, 12);
UNI_MC(epel, v, 16);
UNI_MC(epel, v, 24);
UNI_MC(epel, v, 32);
UNI_MC(epel, v, 48);
UNI_MC(epel, v, 64);
UNI_MC(epel, hv, 4);
UNI_MC(epel, hv, 6);
UNI_MC(epel, hv, 8);
UNI_MC(epel, hv, 12);
UNI_MC(epel, hv, 16);
UNI_MC(epel, hv, 24);
UNI_MC(epel, hv, 32);
UNI_MC(epel, hv, 48);
UNI_MC(epel, hv, 64);
#undef UNI_MC
#define UNI_W_MC(PEL, DIR, WIDTH) \

View File

@ -291,6 +291,7 @@
LD_B2(RTYPE, (psrc), stride, out0, out1); \
out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
@ -573,6 +574,18 @@
SH(out7_m, (pblk_6x4_m + 4)); \
}
/* Description : Store as 8x1 byte block to destination memory from input vector
Arguments : Inputs - in, pdst
Details : Index 0 double word element from input vector 'in' is copied
and stored to destination memory at (pdst)
*/
#define ST8x1_UB(in, pdst) \
{ \
uint64_t out0_m; \
out0_m = __msa_copy_u_d((v2i64) in, 0); \
SD(out0_m, pdst); \
}
/* Description : Store as 8x2 byte block to destination memory from input vector
Arguments : Inputs - in, pdst, stride
Details : Index 0 double word element from input vector 'in' is copied
@ -716,6 +729,23 @@
}
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
/* Description : Immediate number of columns to slide
Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
Outputs - out0, out1
Return Type - as per RTYPE
Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by
number of elements specified by 'slide_val'
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
{ \
out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val); \
out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val); \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
/* Description : Shuffle byte vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
Outputs - out0, out1
@ -1090,6 +1120,16 @@
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \
ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5); \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3) \
@ -1306,6 +1346,7 @@
out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0); \
out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1); \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
@ -1427,7 +1468,9 @@
in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128); \
in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128); \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)
#define XORI_B3_128(RTYPE, in0, in1, in2) \
{ \
@ -1628,6 +1671,14 @@
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
{ \
SRARI_H2(RTYPE, in0, in1, shift); \
SRARI_H2(RTYPE, in2, in3, shift); \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
/* Description : Shift right arithmetic rounded (immediate)
Arguments : Inputs - in0, in1, shift
Outputs - in0, in1 (in place)