; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

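; Shuffle/permute index tables for the pshufb/vpermb sequences below; each
; row presumably gathers overlapping pixel pairs so that a single vpdpwssd
; can apply two filter taps per output lane.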
spel_h_shufA: db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
        db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
spel_h_shufC: db 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
        db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
        db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
        db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
spel_h_shufB: db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
        db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
spel_h_shufD: db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
        db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
        db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
        db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
spel_v_shuf8: db 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23
        db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
        db 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
        db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_v_shuf16: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
        db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
        db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
        db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
prep_endA: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
        db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
        db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
        db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
prep_endB: db 1, 2, 5, 6, 9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
        db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
        db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
        db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
prep_endC: db 1, 2, 5, 6, 9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
        db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
        db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
        db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
spel_shuf4a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
        db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
        db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
        db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
spel_shuf4b: db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
        db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
        db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
        db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf8a: db 1, 2, 17, 18, 5, 6, 21, 22, 9, 10, 25, 26, 13, 14, 29, 30
        db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
        db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
        db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
spel_shuf8b: db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
        db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
        db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
        db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf16: db 1, 2, 33, 34, 5, 6, 37, 38, 9, 10, 41, 42, 13, 14, 45, 46
        db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
        db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
        db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
spel_shuf32: db 1, 2, 65, 66, 5, 6, 69, 70, 9, 10, 73, 74, 13, 14, 77, 78
        db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
        db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
        db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
spel_h_shuf2b: db 1, 2, 17, 18, 5, 6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
        db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50, 9, 10, 53, 54, 13, 14
        db 9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf2: db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
spel_h_shuf2a: db 0, 1, 2, 3, 2, 3, 4, 5, 16, 17, 18, 19, 18, 19, 20, 21
        db 4, 5, 6, 7, 6, 7, 8, 9, 20, 21, 22, 23, 22, 23, 24, 25
w_mask_end42x: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
        db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
w_mask_end444: db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
        db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
        db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
        db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
w_mask_shuf4: db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
        db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
        db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
        db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
w_mask_shuf8: db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
        db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
        db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
        db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
w_mask_shuf16: db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
        db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
        db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
        db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
warp8x8_permA: db 0, 1, 2, 3, 32, 33, 34, 35, 2, 3, 4, 5, 34, 35, 36, 37
        db 4, 5, 6, 7, 36, 37, 38, 39, 6, 7, 8, 9, 38, 39, 40, 41
        db 8, 9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
        db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
        db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
        db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
        db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
warp8x8_end: db 0, 1, 4, 5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
        db 2, 3, 6, 7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
        db 8, 9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
        db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
deint_q_shuf: ;dq 0, 2, 4, 6, 1, 3, 5, 7
pd_0to7: dd 0, 1, 2, 3, 4, 5, 6, 7
        dd 1
pw_2048: times 2 dw 2048
        dd 3
pw_8192: times 2 dw 8192
avg_shift: dw 5, 5, 3, 3
pw_27615: times 2 dw 27615
pw_32766: times 2 dw 32766
warp8x8_permC: db -1, 0, -1, 1, -1, 8, -1, 9, -1, 4, -1, 5, -1, 12, -1, 13
warp8x8_permD: db -1, 2, -1, 3, -1, 10, -1, 11, -1, 6, -1, 7, -1, 14, -1, 15
warp_shift_h: db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
resize_permA: dd 0, 4, 8, 12, 1, 5, 9, 13, 16, 20, 24, 28, 17, 21, 25, 29
resize_permB: dd 2, 6, 10, 14, 3, 7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
resize_permC: dq 0, 1, 4, 5, 8, 9, 12, 13
resize_permD: dq 2, 3, 6, 7, 10, 11, 14, 15
resize_permE: dq 0, 2, 4, 6
resize_shufA: db -1, 0, -1, 1, -1, 4, -1, 5, -1, 8, -1, 9, -1, 12, -1, 13
resize_shufB: db -1, 2, -1, 3, -1, 6, -1, 7, -1, 10, -1, 11, -1, 14, -1, 15
rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
resize_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7
        db 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15

prep_hv_shift: dq 6, 4
put_bilin_h_rnd: dw 8, 8, 10, 10
prep_mul: dw 16, 16, 4, 4
put_8tap_h_rnd: dd 34, 40
prep_8tap_rnd: dd 128 - (8192 << 8)
warp_8x8_rnd_h: dd 512, 2048
warp_8x8_rnd_v: dd 262144, 65536
warp_8x8t_rnd_v: dd 16384 - (8192 << 15)
avg_round: dw -16400, -16400, -16388, -16388
w_avg_round: dd 128 + (8192 << 4), 32 + (8192 << 4)
mask_round: dd 512 + (8192 << 6), 128 + (8192 << 6)
w_mask_round: dd 128, 64
bidir_shift: dw 6, 6, 4, 4

pb_64: times 4 db 64
pw_m512: times 2 dw -512
pw_2: times 2 dw 2
pw_64: times 2 dw 64
pd_32: dd 32
pd_63: dd 63
pd_128: dd 128
pd_640: dd 640
pd_2176: dd 2176
pd_16384: dd 16384
pd_0_4: dd 0, 4

%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h

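; The macros below build per-width jump tables: each entry stores the
; offset of a .w%d/.h_w%d/.v_w%d/.hv_w%d label relative to a base symbol,
; so dispatch is a single table load plus an indirect jmp (see .put/.prep).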
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base %1_%2
%%table:
%rep %0 - 2
    dw %%base %+ _w%3 - %%base
%rotate 1
%endrep
%endmacro

%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
%xdefine %%base %1_%3
%assign %%types %4
%if %%types & 1
%xdefine %1_%2_h_%3_table (%%h - %5)
%%h:
%rep %0 - 4
    dw %%prefix %+ .h_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 2
%xdefine %1_%2_v_%3_table (%%v - %5)
%%v:
%rep %0 - 4
    dw %%prefix %+ .v_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 4
%xdefine %1_%2_hv_%3_table (%%hv - %5)
%%hv:
%rep %0 - 4
    dw %%prefix %+ .hv_w%5 - %%base
%rotate 1
%endrep
%endif
%endmacro

%macro BIDIR_JMP_TABLE 2-*
%xdefine %1_%2_table (%%table - 2*%3)
%xdefine %%base %1_%2_table
%xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
%rep %0 - 2
    dd %%prefix %+ .w%3 - %%base
%rotate 1
%endrep
%endmacro

%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)

BIDIR_JMP_TABLE avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend, avx512icl, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v, avx512icl, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE put, avx512icl, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 6tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, 8tap, avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 6tap, avx512icl, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx512icl, 2, 4, 8, 16, 32, 64, 128

%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

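; The -8 bias presumably folds the "-1" of the 1-based subpel index
; (mx/my in 1..15) into the symbol, so [subpel_filters+idx*8] addresses
; filter entry idx-1.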
cextern mc_warp_filter
cextern obmc_masks_avx2
cextern resize_filter

SECTION .text

%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif

INIT_ZMM avx512icl
cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
    mov mxyd, r6m ; mx
    lea r7, [put_avx512icl]
    tzcnt t0d, wm
    movifnidn hd, hm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r7m ; my
    test mxyd, mxyd
    jnz .v
.put:
    movzx t0d, word [r7+t0*2+table_offset(put,)]
    add t0, r7
    jmp t0
.put_w2:
    mov r6d, [srcq+ssq*0]
    mov r7d, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r6d
    mov [dstq+dsq*1], r7d
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w2
    RET
.put_w4:
    mov r6, [srcq+ssq*0]
    mov r7, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mov [dstq+dsq*0], r6
    mov [dstq+dsq*1], r7
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w4
    RET
.put_w8:
    movu xmm0, [srcq+ssq*0]
    movu xmm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], xmm0
    mova [dstq+dsq*1], xmm1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w8
    RET
.put_w16:
    movu ym0, [srcq+ssq*0]
    movu ym1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], ym0
    mova [dstq+dsq*1], ym1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w16
    RET
.put_w32:
    movu m0, [srcq+ssq*0]
    movu m1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w32
    RET
.put_w64:
    movu m0, [srcq+ssq*0+64*0]
    movu m1, [srcq+ssq*0+64*1]
    movu m2, [srcq+ssq*1+64*0]
    movu m3, [srcq+ssq*1+64*1]
    lea srcq, [srcq+ssq*2]
    mova [dstq+dsq*0+64*0], m0
    mova [dstq+dsq*0+64*1], m1
    mova [dstq+dsq*1+64*0], m2
    mova [dstq+dsq*1+64*1], m3
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .put_w64
    RET
.put_w128:
    movu m0, [srcq+64*0]
    movu m1, [srcq+64*1]
    movu m2, [srcq+64*2]
    movu m3, [srcq+64*3]
    add srcq, ssq
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    mova [dstq+64*2], m2
    mova [dstq+64*3], m3
    add dstq, dsq
    dec hd
    jg .put_w128
    RET
.h:
    vpbroadcastw m5, mxyd
    mov mxyd, r7m ; my
    vpbroadcastd m4, [pw_16]
    psubw m4, m5
    test mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    movzx t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
    mov r6d, r8m ; bitdepth_max
    add t0, r7
    shr r6d, 11
    vpbroadcastd m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
    jmp t0
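; Horizontal bilin filtering: m4 = 16-mx and m5 = mx (broadcast words), so
; each width branch computes ((16-mx)*px[x] + mx*px[x+1] + rnd) >> 4 via
; pmullw/paddw/psrlw.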
.h_w2:
    movq xmm1, [srcq+ssq*0]
    movhps xmm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    pmullw xmm0, xmm1, xm4
    psrlq xmm1, 16
    pmullw xmm1, xm5
    paddw xmm0, xm6
    paddw xmm0, xmm1
    psrlw xmm0, 4
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2
    RET
.h_w4:
    movq xmm0, [srcq+ssq*0+0]
    movhps xmm0, [srcq+ssq*1+0]
    movq xmm1, [srcq+ssq*0+2]
    movhps xmm1, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    pmullw xmm0, xm4
    pmullw xmm1, xm5
    paddw xmm0, xm6
    paddw xmm0, xmm1
    psrlw xmm0, 4
    movq [dstq+dsq*0], xmm0
    movhps [dstq+dsq*1], xmm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4
    RET
.h_w8:
    movu xm0, [srcq+ssq*0+0]
    vinserti32x4 ym0, [srcq+ssq*1+0], 1
    movu xm1, [srcq+ssq*0+2]
    vinserti32x4 ym1, [srcq+ssq*1+2], 1
    lea srcq, [srcq+ssq*2]
    pmullw ym0, ym4
    pmullw ym1, ym5
    paddw ym0, ym6
    paddw ym0, ym1
    psrlw ym0, 4
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8
    RET
.h_w16:
    movu ym0, [srcq+ssq*0+0]
    vinserti32x8 m0, [srcq+ssq*1+0], 1
    movu ym1, [srcq+ssq*0+2]
    vinserti32x8 m1, [srcq+ssq*1+2], 1
    lea srcq, [srcq+ssq*2]
    pmullw m0, m4
    pmullw m1, m5
    paddw m0, m6
    paddw m0, m1
    psrlw m0, 4
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w16
    RET
.h_w32:
    pmullw m0, m4, [srcq+ssq*0+0]
    pmullw m2, m5, [srcq+ssq*0+2]
    pmullw m1, m4, [srcq+ssq*1+0]
    pmullw m3, m5, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    paddw m0, m6
    paddw m1, m6
    paddw m0, m2
    paddw m1, m3
    psrlw m0, 4
    psrlw m1, 4
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w32
    RET
.h_w64:
    pmullw m0, m4, [srcq+64*0+0]
    pmullw m2, m5, [srcq+64*0+2]
    pmullw m1, m4, [srcq+64*1+0]
    pmullw m3, m5, [srcq+64*1+2]
    add srcq, ssq
    paddw m0, m6
    paddw m1, m6
    paddw m0, m2
    paddw m1, m3
    psrlw m0, 4
    psrlw m1, 4
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    add dstq, dsq
    dec hd
    jg .h_w64
    RET
.h_w128:
    pmullw m0, m4, [srcq+64*0+0]
    pmullw m7, m5, [srcq+64*0+2]
    pmullw m1, m4, [srcq+64*1+0]
    pmullw m8, m5, [srcq+64*1+2]
    pmullw m2, m4, [srcq+64*2+0]
    pmullw m9, m5, [srcq+64*2+2]
    pmullw m3, m4, [srcq+64*3+0]
    pmullw m10, m5, [srcq+64*3+2]
    add srcq, ssq
    REPX {paddw x, m6}, m0, m1, m2, m3
    paddw m0, m7
    paddw m1, m8
    paddw m2, m9
    paddw m3, m10
    REPX {psrlw x, 4}, m0, m1, m2, m3
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    mova [dstq+64*2], m2
    mova [dstq+64*3], m3
    add dstq, dsq
    dec hd
    jg .h_w128
    RET
.v:
    movzx t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
    shl mxyd, 11
    vpbroadcastw m8, mxyd
    add t0, r7
    jmp t0
.v_w2:
    movd xmm0, [srcq+ssq*0]
.v_w2_loop:
    movd xmm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    punpckldq xmm2, xmm0, xmm1
    movd xmm0, [srcq+ssq*0]
    punpckldq xmm1, xmm0
    psubw xmm1, xmm2
    pmulhrsw xmm1, xm8
    paddw xmm1, xmm2
    movd [dstq+dsq*0], xmm1
    pextrd [dstq+dsq*1], xmm1, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq xmm0, [srcq+ssq*0]
.v_w4_loop:
    movq xmm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    punpcklqdq xmm2, xmm0, xmm1
    movq xmm0, [srcq+ssq*0]
    punpcklqdq xmm1, xmm0
    psubw xmm1, xmm2
    pmulhrsw xmm1, xm8
    paddw xmm1, xmm2
    movq [dstq+dsq*0], xmm1
    movhps [dstq+dsq*1], xmm1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movu xmm0, [srcq+ssq*0]
.v_w8_loop:
    vbroadcasti128 ymm1, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpblendd ymm2, ymm0, ymm1, 0xf0
    vbroadcasti128 ymm0, [srcq+ssq*0]
    vpblendd ymm1, ymm0, 0xf0
    psubw ymm1, ymm2
    pmulhrsw ymm1, ym8
    paddw ymm1, ymm2
    mova [dstq+dsq*0], xmm1
    vextracti128 [dstq+dsq*1], ymm1, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    vzeroupper
    RET
.v_w16:
    movu ym0, [srcq+ssq*0]
.v_w16_loop:
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    psubw ym1, ym3, ym0
    pmulhrsw ym1, ym8
    paddw ym1, ym0
    movu ym0, [srcq+ssq*0]
    psubw ym2, ym0, ym3
    pmulhrsw ym2, ym8
    paddw ym2, ym3
    mova [dstq+dsq*0], ym1
    mova [dstq+dsq*1], ym2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    RET
.v_w32:
    movu m0, [srcq+ssq*0]
.v_w32_loop:
    movu m3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    psubw m1, m3, m0
    pmulhrsw m1, m8
    paddw m1, m0
    movu m0, [srcq+ssq*0]
    psubw m2, m0, m3
    pmulhrsw m2, m8
    paddw m2, m3
    mova [dstq+dsq*0], m1
    mova [dstq+dsq*1], m2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w32_loop
    RET
.v_w64:
    movu m0, [srcq+ssq*0+64*0]
    movu m1, [srcq+ssq*0+64*1]
.v_w64_loop:
    movu m2, [srcq+ssq*1+64*0]
    movu m3, [srcq+ssq*1+64*1]
    lea srcq, [srcq+ssq*2]
    psubw m4, m2, m0
    pmulhrsw m4, m8
    paddw m4, m0
    movu m0, [srcq+ssq*0+64*0]
    psubw m5, m3, m1
    pmulhrsw m5, m8
    paddw m5, m1
    movu m1, [srcq+ssq*0+64*1]
    psubw m6, m0, m2
    pmulhrsw m6, m8
    psubw m7, m1, m3
    pmulhrsw m7, m8
    mova [dstq+dsq*0+64*0], m4
    mova [dstq+dsq*0+64*1], m5
    paddw m6, m2
    paddw m7, m3
    mova [dstq+dsq*1+64*0], m6
    mova [dstq+dsq*1+64*1], m7
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w64_loop
    RET
.v_w128:
    movu m0, [srcq+ssq*0+64*0]
    movu m1, [srcq+ssq*0+64*1]
    movu m2, [srcq+ssq*0+64*2]
    movu m3, [srcq+ssq*0+64*3]
.v_w128_loop:
    movu m4, [srcq+ssq*1+64*0]
    movu m5, [srcq+ssq*1+64*1]
    movu m6, [srcq+ssq*1+64*2]
    movu m7, [srcq+ssq*1+64*3]
    lea srcq, [srcq+ssq*2]
    psubw m9, m4, m0
    pmulhrsw m9, m8
    paddw m9, m0
    movu m0, [srcq+ssq*0+64*0]
    psubw m10, m5, m1
    pmulhrsw m10, m8
    paddw m10, m1
    movu m1, [srcq+ssq*0+64*1]
    psubw m11, m6, m2
    pmulhrsw m11, m8
    paddw m11, m2
    movu m2, [srcq+ssq*0+64*2]
    psubw m12, m7, m3
    pmulhrsw m12, m8
    paddw m12, m3
    movu m3, [srcq+ssq*0+64*3]
    mova [dstq+dsq*0+64*0], m9
    psubw m9, m0, m4
    pmulhrsw m9, m8
    mova [dstq+dsq*0+64*1], m10
    psubw m10, m1, m5
    pmulhrsw m10, m8
    mova [dstq+dsq*0+64*2], m11
    psubw m11, m2, m6
    pmulhrsw m11, m8
    mova [dstq+dsq*0+64*3], m12
    psubw m12, m3, m7
    pmulhrsw m12, m8
    paddw m9, m4
    paddw m10, m5
    mova [dstq+dsq*1+64*0], m9
    mova [dstq+dsq*1+64*1], m10
    paddw m11, m6
    paddw m12, m7
    mova [dstq+dsq*1+64*2], m11
    mova [dstq+dsq*1+64*3], m12
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w128_loop
    RET
.hv:
    movzx t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
    shl mxyd, 11
    vpbroadcastd m6, [pw_2]
    vpbroadcastw m7, mxyd
    vpbroadcastd m8, [pw_8192]
    add t0, r7
    test dword r8m, 0x800
    jnz .hv_12bpc
    psllw m4, 2
    psllw m5, 2
    vpbroadcastd m8, [pw_2048]
.hv_12bpc:
    jmp t0
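; H+V bilin filtering: the horizontal pass keeps 2 fractional bits
; (psrlw 2); the vertical pass interpolates with pmulhw by my << 11 and
; rounds via pmulhrsw with pw_2048 (10-bit) or pw_8192 (12-bit).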
.hv_w2:
    vpbroadcastq xmm1, [srcq+ssq*0]
    pmullw xmm0, xmm1, xm4
    psrlq xmm1, 16
    pmullw xmm1, xm5
    paddw xmm0, xm6
    paddw xmm0, xmm1
    psrlw xmm0, 2
.hv_w2_loop:
    movq xmm2, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    movhps xmm2, [srcq+ssq*0]
    pmullw xmm1, xmm2, xm4
    psrlq xmm2, 16
    pmullw xmm2, xm5
    paddw xmm1, xm6
    paddw xmm1, xmm2
    psrlw xmm1, 2 ; 1 _ 2 _
    shufpd xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
    mova xmm0, xmm1
    psubw xmm1, xmm2
    paddw xmm1, xmm1
    pmulhw xmm1, xm7
    paddw xmm1, xmm2
    pmulhrsw xmm1, xm8
    movd [dstq+dsq*0], xmm1
    pextrd [dstq+dsq*1], xmm1, 2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    pmullw xmm0, xm4, [srcq+ssq*0-8]
    pmullw xmm1, xm5, [srcq+ssq*0-6]
    paddw xmm0, xm6
    paddw xmm0, xmm1
    psrlw xmm0, 2
.hv_w4_loop:
    movq xmm1, [srcq+ssq*1+0]
    movq xmm2, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    movhps xmm1, [srcq+ssq*0+0]
    movhps xmm2, [srcq+ssq*0+2]
    pmullw xmm1, xm4
    pmullw xmm2, xm5
    paddw xmm1, xm6
    paddw xmm1, xmm2
    psrlw xmm1, 2 ; 1 2
    shufpd xmm2, xmm0, xmm1, 0x01 ; 0 1
    mova xmm0, xmm1
    psubw xmm1, xmm2
    paddw xmm1, xmm1
    pmulhw xmm1, xm7
    paddw xmm1, xmm2
    pmulhrsw xmm1, xm8
    movq [dstq+dsq*0], xmm1
    movhps [dstq+dsq*1], xmm1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    pmullw xmm0, xm4, [srcq+ssq*0+0]
    pmullw xmm1, xm5, [srcq+ssq*0+2]
    paddw xmm0, xm6
    paddw xmm0, xmm1
    psrlw xmm0, 2
    vinserti32x4 ym0, xmm0, 1
.hv_w8_loop:
    movu xm1, [srcq+ssq*1+0]
    movu xm2, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym1, [srcq+ssq*0+0], 1
    vinserti32x4 ym2, [srcq+ssq*0+2], 1
    pmullw ym1, ym4
    pmullw ym2, ym5
    paddw ym1, ym6
    paddw ym1, ym2
    psrlw ym1, 2 ; 1 2
    vshufi32x4 ym2, ym0, ym1, 0x01 ; 0 1
    mova ym0, ym1
    psubw ym1, ym2
    paddw ym1, ym1
    pmulhw ym1, ym7
    paddw ym1, ym2
    pmulhrsw ym1, ym8
    mova [dstq+dsq*0], xm1
    vextracti32x4 [dstq+dsq*1], ym1, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
    pmullw ym0, ym4, [srcq+ssq*0+0]
    pmullw ym1, ym5, [srcq+ssq*0+2]
    paddw ym0, ym6
    paddw ym0, ym1
    psrlw ym0, 2
    vinserti32x8 m0, ym0, 1
.hv_w16_loop:
    movu ym1, [srcq+ssq*1+0]
    movu ym2, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m1, [srcq+ssq*0+0], 1
    vinserti32x8 m2, [srcq+ssq*0+2], 1
    pmullw m1, m4
    pmullw m2, m5
    paddw m1, m6
    paddw m1, m2
    psrlw m1, 2 ; 1 2
    vshufi32x4 m2, m0, m1, q1032 ; 0 1
    mova m0, m1
    psubw m1, m2
    paddw m1, m1
    pmulhw m1, m7
    paddw m1, m2
    pmulhrsw m1, m8
    mova [dstq+dsq*0], ym1
    vextracti32x8 [dstq+dsq*1], m1, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w16_loop
    RET
.hv_w32:
.hv_w64:
.hv_w128:
    movifnidn wd, wm
    lea r6d, [hq+wq*8-256]
    mov r4, srcq
    mov r7, dstq
.hv_w32_loop0:
    pmullw m0, m4, [srcq+ssq*0+0]
    pmullw m1, m5, [srcq+ssq*0+2]
    paddw m0, m6
    paddw m0, m1
    psrlw m0, 2
.hv_w32_loop:
    pmullw m3, m4, [srcq+ssq*1+0]
    pmullw m1, m5, [srcq+ssq*1+2]
    lea srcq, [srcq+ssq*2]
    paddw m3, m6
    paddw m3, m1
    psrlw m3, 2
    psubw m1, m3, m0
    paddw m1, m1
    pmulhw m1, m7
    paddw m1, m0
    pmullw m0, m4, [srcq+ssq*0+0]
    pmullw m2, m5, [srcq+ssq*0+2]
    paddw m0, m6
    paddw m0, m2
    psrlw m0, 2
    psubw m2, m0, m3
    paddw m2, m2
    pmulhw m2, m7
    paddw m2, m3
    pmulhrsw m1, m8
    pmulhrsw m2, m8
    mova [dstq+dsq*0], m1
    mova [dstq+dsq*1], m2
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w32_loop
    add r4, 64
    add r7, 64
    movzx hd, r6b
    mov srcq, r4
    mov dstq, r7
    sub r6d, 1<<8
    jg .hv_w32_loop0
    RET

cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
    movifnidn mxyd, r5m ; mx
    lea r6, [prep_avx512icl]
    tzcnt wd, wm
    movifnidn hd, hm
    test mxyd, mxyd
    jnz .h
    mov mxyd, r6m ; my
    test mxyd, mxyd
    jnz .v
.prep:
    movzx wd, word [r6+wq*2+table_offset(prep,)]
    mov r5d, r7m ; bitdepth_max
    vpbroadcastd m5, [r6-prep_avx512icl+pw_8192]
    add wq, r6
    shr r5d, 11
    vpbroadcastd m4, [r6-prep_avx512icl+prep_mul+r5*4]
    lea stride3q, [strideq*3]
    jmp wq
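; prep scales pixels to a common 14-bit range (x16 for 10-bit, x4 for
; 12-bit, per prep_mul) and subtracts 8192 to store signed 16-bit
; intermediates.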
.prep_w4:
    mov r3d, 0x0c
    kmovb k1, r3d
.prep_w4_loop:
    movq xm0, [srcq+strideq*0]
    movhps xm0, [srcq+strideq*1]
    vpbroadcastq ym1, [srcq+strideq*2]
    vpunpcklqdq ym0{k1}, ym1, [srcq+stride3q] {1to4}
    lea srcq, [srcq+strideq*4]
    pmullw ym0, ym4
    psubw ym0, ym5
    mova [tmpq], ym0
    add tmpq, 32
    sub hd, 4
    jg .prep_w4_loop
    RET
.prep_w8:
    movu xm0, [srcq+strideq*0]
    vinserti32x4 ym0, [srcq+strideq*1], 1
    vinserti32x4 m0, [srcq+strideq*2], 2
    vinserti32x4 m0, [srcq+stride3q ], 3
    lea srcq, [srcq+strideq*4]
    pmullw m0, m4
    psubw m0, m5
    mova [tmpq], m0
    add tmpq, 64
    sub hd, 4
    jg .prep_w8
    RET
.prep_w16:
    movu ym0, [srcq+strideq*0]
    vinserti32x8 m0, [srcq+strideq*1], 1
    movu ym1, [srcq+strideq*2]
    vinserti32x8 m1, [srcq+stride3q ], 1
    lea srcq, [srcq+strideq*4]
    pmullw m0, m4
    pmullw m1, m4
    psubw m0, m5
    psubw m1, m5
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    add tmpq, 64*2
    sub hd, 4
    jg .prep_w16
    RET
.prep_w32:
    pmullw m0, m4, [srcq+strideq*0]
    pmullw m1, m4, [srcq+strideq*1]
    pmullw m2, m4, [srcq+strideq*2]
    pmullw m3, m4, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    REPX {psubw x, m5}, m0, m1, m2, m3
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    mova [tmpq+64*2], m2
    mova [tmpq+64*3], m3
    add tmpq, 64*4
    sub hd, 4
    jg .prep_w32
    RET
.prep_w64:
    pmullw m0, m4, [srcq+strideq*0+64*0]
    pmullw m1, m4, [srcq+strideq*0+64*1]
    pmullw m2, m4, [srcq+strideq*1+64*0]
    pmullw m3, m4, [srcq+strideq*1+64*1]
    lea srcq, [srcq+strideq*2]
    REPX {psubw x, m5}, m0, m1, m2, m3
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    mova [tmpq+64*2], m2
    mova [tmpq+64*3], m3
    add tmpq, 64*4
    sub hd, 2
    jg .prep_w64
    RET
.prep_w128:
    pmullw m0, m4, [srcq+64*0]
    pmullw m1, m4, [srcq+64*1]
    pmullw m2, m4, [srcq+64*2]
    pmullw m3, m4, [srcq+64*3]
    add srcq, strideq
    REPX {psubw x, m5}, m0, m1, m2, m3
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    mova [tmpq+64*2], m2
    mova [tmpq+64*3], m3
    add tmpq, 64*4
    dec hd
    jg .prep_w128
    RET
.h:
    vpbroadcastw m5, mxyd
    mov mxyd, r6m ; my
    vpbroadcastd m4, [pw_16]
    vpbroadcastd m6, [pw_32766]
    psubw m4, m5
    test dword r7m, 0x800
    jnz .h_12bpc
    psllw m4, 2
    psllw m5, 2
.h_12bpc:
    test mxyd, mxyd
    jnz .hv
    movzx wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    add wq, r6
    lea stride3q, [strideq*3]
    jmp wq
.h_w4:
    movu xm1, [srcq+strideq*0]
    vinserti32x4 ym1, [srcq+strideq*2], 1
    movu xm2, [srcq+strideq*1]
    vinserti32x4 ym2, [srcq+stride3q ], 1
    lea srcq, [srcq+strideq*4]
    punpcklqdq ym0, ym1, ym2
    psrldq ym1, 2
    psrldq ym2, 2
    pmullw ym0, ym4
    punpcklqdq ym1, ym2
    pmullw ym1, ym5
    psubw ym0, ym6
    paddw ym0, ym1
    psraw ym0, 2
    mova [tmpq], ym0
    add tmpq, 32
    sub hd, 4
    jg .h_w4
    RET
.h_w8:
    movu xm0, [srcq+strideq*0+0]
    movu xm1, [srcq+strideq*0+2]
    vinserti32x4 ym0, [srcq+strideq*1+0], 1
    vinserti32x4 ym1, [srcq+strideq*1+2], 1
    vinserti32x4 m0, [srcq+strideq*2+0], 2
    vinserti32x4 m1, [srcq+strideq*2+2], 2
    vinserti32x4 m0, [srcq+stride3q +0], 3
    vinserti32x4 m1, [srcq+stride3q +2], 3
    lea srcq, [srcq+strideq*4]
    pmullw m0, m4
    pmullw m1, m5
    psubw m0, m6
    paddw m0, m1
    psraw m0, 2
    mova [tmpq], m0
    add tmpq, 64
    sub hd, 4
    jg .h_w8
    RET
.h_w16:
    movu ym0, [srcq+strideq*0+0]
    vinserti32x8 m0, [srcq+strideq*1+0], 1
    movu ym1, [srcq+strideq*0+2]
    vinserti32x8 m1, [srcq+strideq*1+2], 1
    lea srcq, [srcq+strideq*2]
    pmullw m0, m4
    pmullw m1, m5
    psubw m0, m6
    paddw m0, m1
    psraw m0, 2
    mova [tmpq], m0
    add tmpq, 64
    sub hd, 2
    jg .h_w16
    RET
.h_w32:
    pmullw m0, m4, [srcq+strideq*0+0]
    pmullw m2, m5, [srcq+strideq*0+2]
    pmullw m1, m4, [srcq+strideq*1+0]
    pmullw m3, m5, [srcq+strideq*1+2]
    lea srcq, [srcq+strideq*2]
    psubw m0, m6
    psubw m1, m6
    paddw m0, m2
    paddw m1, m3
    psraw m0, 2
    psraw m1, 2
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    add tmpq, 64*2
    sub hd, 2
    jg .h_w32
    RET
.h_w64:
    pmullw m0, m4, [srcq+ 0]
    pmullw m2, m5, [srcq+ 2]
    pmullw m1, m4, [srcq+64]
    pmullw m3, m5, [srcq+66]
    add srcq, strideq
    psubw m0, m6
    psubw m1, m6
    paddw m0, m2
    paddw m1, m3
    psraw m0, 2
    psraw m1, 2
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    add tmpq, 64*2
    dec hd
    jg .h_w64
    RET
.h_w128:
    pmullw m0, m4, [srcq+ 0]
    pmullw m7, m5, [srcq+ 2]
    pmullw m1, m4, [srcq+ 64]
    pmullw m8, m5, [srcq+ 66]
    pmullw m2, m4, [srcq+128]
    pmullw m9, m5, [srcq+130]
    pmullw m3, m4, [srcq+192]
    pmullw m10, m5, [srcq+194]
    add srcq, strideq
    REPX {psubw x, m6}, m0, m1, m2, m3
    paddw m0, m7
    paddw m1, m8
    paddw m2, m9
    paddw m3, m10
    REPX {psraw x, 2}, m0, m1, m2, m3
    mova [tmpq+64*0], m0
    mova [tmpq+64*1], m1
    mova [tmpq+64*2], m2
    mova [tmpq+64*3], m3
    add tmpq, 64*4
    dec hd
    jg .h_w128
    RET
.v:
    movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    vpbroadcastw m9, mxyd
    vpbroadcastd m8, [pw_16]
    vpbroadcastd m10, [pw_32766]
    add wq, r6
    lea stride3q, [strideq*3]
    psubw m8, m9
    test dword r7m, 0x800
    jnz .v_12bpc
    psllw m8, 2
    psllw m9, 2
.v_12bpc:
    jmp wq
.v_w4:
    movq xmm0, [srcq+strideq*0]
.v_w4_loop:
    vpbroadcastq xmm2, [srcq+strideq*1]
    vpbroadcastq ymm1, [srcq+strideq*2]
    vpbroadcastq ymm3, [srcq+stride3q ]
    lea srcq, [srcq+strideq*4]
    vpblendd ymm2, ymm1, 0x30
    vpblendd ymm2, ymm3, 0xc0
    vpblendd ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
    movq xmm0, [srcq+strideq*0]
    valignq ymm2, ymm0, ymm2, 1 ; 1 2 3 4
    pmullw ymm1, ym8
    pmullw ymm2, ym9
    psubw ymm1, ym10
    paddw ymm1, ymm2
    psraw ymm1, 2
    mova [tmpq], ymm1
    add tmpq, 32
    sub hd, 4
    jg .v_w4_loop
    vzeroupper
    RET
.v_w8:
    movu xm0, [srcq+strideq*0]
.v_w8_loop:
    vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
    vinserti32x4 m1, [srcq+strideq*2], 2
    vinserti32x4 m1, [srcq+stride3q ], 3 ; 0 1 2 3
    lea srcq, [srcq+strideq*4]
    movu xm0, [srcq+strideq*0]
    valignq m2, m0, m1, 2 ; 1 2 3 4
    pmullw m1, m8
    pmullw m2, m9
    psubw m1, m10
    paddw m1, m2
    psraw m1, 2
    mova [tmpq], m1
    add tmpq, 64
    sub hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    movu ym0, [srcq+strideq*0]
.v_w16_loop:
    vinserti32x8 m1, m0, [srcq+strideq*1], 1 ; 0 1
    movu ym3, [srcq+strideq*2]
    vinserti32x8 m2, m3, [srcq+stride3q ], 1 ; 2 3
    lea srcq, [srcq+strideq*4]
    movu ym0, [srcq+strideq*0]
    vshufi32x4 m3, m1, m3, q1032 ; 1 2
    vshufi32x4 m4, m2, m0, q1032 ; 3 4
    pmullw m1, m8
    pmullw m2, m8
    pmullw m3, m9
    pmullw m4, m9
    psubw m1, m10
    psubw m2, m10
    paddw m1, m3
    paddw m2, m4
    psraw m1, 2
    psraw m2, 2
    mova [tmpq+64*0], m1
    mova [tmpq+64*1], m2
    add tmpq, 64*2
    sub hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    movu m0, [srcq+strideq*0]
.v_w32_loop:
    movu m3, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    pmullw m1, m8, m0
    movu m0, [srcq+strideq*0]
    pmullw m2, m8, m3
    pmullw m3, m9
    pmullw m4, m9, m0
    psubw m1, m10
    psubw m2, m10
    paddw m1, m3
    paddw m2, m4
    psraw m1, 2
    psraw m2, 2
    mova [tmpq+64*0], m1
    mova [tmpq+64*1], m2
    add tmpq, 64*2
    sub hd, 2
    jg .v_w32_loop
    RET
.v_w64:
    movu m0, [srcq+64*0]
    movu m1, [srcq+64*1]
.v_w64_loop:
    add srcq, strideq
    pmullw m2, m8, m0
    movu m0, [srcq+64*0]
    pmullw m3, m8, m1
    movu m1, [srcq+64*1]
    pmullw m4, m9, m0
    pmullw m5, m9, m1
    psubw m2, m10
    psubw m3, m10
    paddw m2, m4
    paddw m3, m5
    psraw m2, 2
    psraw m3, 2
    mova [tmpq+64*0], m2
    mova [tmpq+64*1], m3
    add tmpq, 64*2
    dec hd
    jg .v_w64_loop
    RET
.v_w128:
    movu m0, [srcq+64*0]
    movu m1, [srcq+64*1]
    movu m2, [srcq+64*2]
    movu m3, [srcq+64*3]
.v_w128_loop:
    add srcq, strideq
    pmullw m4, m8, m0
    movu m0, [srcq+64*0]
    pmullw m5, m8, m1
    movu m1, [srcq+64*1]
    pmullw m6, m8, m2
    movu m2, [srcq+64*2]
    pmullw m7, m8, m3
    movu m3, [srcq+64*3]
    pmullw m11, m9, m0
    pmullw m12, m9, m1
    pmullw m13, m9, m2
    pmullw m14, m9, m3
    REPX {psubw x, m10}, m4, m5, m6, m7
    paddw m4, m11
    paddw m5, m12
    paddw m6, m13
    paddw m7, m14
    REPX {psraw x, 2}, m4, m5, m6, m7
    mova [tmpq+64*0], m4
    mova [tmpq+64*1], m5
    mova [tmpq+64*2], m6
    mova [tmpq+64*3], m7
    add tmpq, 64*4
    dec hd
    jg .v_w128_loop
    RET
.hv:
    movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
    shl mxyd, 11
    vpbroadcastw m7, mxyd
    add wq, r6
    lea stride3q, [strideq*3]
    jmp wq
.hv_w4:
    movq xmm0, [srcq+strideq*0+0]
    movq xmm1, [srcq+strideq*0+2]
    pmullw xmm0, xm4
    pmullw xmm1, xm5
    psubw xmm0, xm6
    paddw xmm0, xmm1
    psraw xmm0, 2
    vpbroadcastq ym0, xmm0
.hv_w4_loop:
    movu xm1, [srcq+strideq*1]
    vinserti128 ym1, [srcq+stride3q ], 1
    movu xm2, [srcq+strideq*2]
    lea srcq, [srcq+strideq*4]
    vinserti128 ym2, [srcq+strideq*0], 1
    punpcklqdq ym3, ym1, ym2
    psrldq ym1, 2
    psrldq ym2, 2
    pmullw ym3, ym4
    punpcklqdq ym1, ym2
    pmullw ym1, ym5
    psubw ym3, ym6
    paddw ym1, ym3
    psraw ym1, 2 ; 1 2 3 4
    valignq ym2, ym1, ym0, 3 ; 0 1 2 3
    mova ym0, ym1
    psubw ym1, ym2
    pmulhrsw ym1, ym7
    paddw ym1, ym2
    mova [tmpq], ym1
    add tmpq, 32
    sub hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    pmullw xm0, xm4, [srcq+strideq*0+0]
    pmullw xm1, xm5, [srcq+strideq*0+2]
    psubw xm0, xm6
    paddw xm0, xm1
    psraw xm0, 2
    vinserti32x4 m0, xm0, 3
.hv_w8_loop:
    movu xm1, [srcq+strideq*1+0]
    movu xm2, [srcq+strideq*1+2]
    vinserti32x4 ym1, [srcq+strideq*2+0], 1
    vinserti32x4 ym2, [srcq+strideq*2+2], 1
    vinserti32x4 m1, [srcq+stride3q +0], 2
    vinserti32x4 m2, [srcq+stride3q +2], 2
    lea srcq, [srcq+strideq*4]
    vinserti32x4 m1, [srcq+strideq*0+0], 3
    vinserti32x4 m2, [srcq+strideq*0+2], 3
    pmullw m1, m4
    pmullw m2, m5
    psubw m1, m6
    paddw m1, m2
    psraw m1, 2 ; 1 2 3 4
    valignq m2, m1, m0, 6 ; 0 1 2 3
    mova m0, m1
    psubw m1, m2
    pmulhrsw m1, m7
    paddw m1, m2
    mova [tmpq], m1
    add tmpq, 64
    sub hd, 4
    jg .hv_w8_loop
    RET
.hv_w16:
    pmullw ym0, ym4, [srcq+strideq*0+0]
    pmullw ym1, ym5, [srcq+strideq*0+2]
    psubw ym0, ym6
    paddw ym0, ym1
    psraw ym0, 2
    vinserti32x8 m0, ym0, 1
.hv_w16_loop:
    movu ym1, [srcq+strideq*1+0]
    movu ym2, [srcq+strideq*1+2]
    lea srcq, [srcq+strideq*2]
    vinserti32x8 m1, [srcq+strideq*0+0], 1
    vinserti32x8 m2, [srcq+strideq*0+2], 1
    pmullw m1, m4
    pmullw m2, m5
    psubw m1, m6
    paddw m1, m2
    psraw m1, 2 ; 1 2
    vshufi32x4 m2, m0, m1, q1032 ; 0 1
    mova m0, m1
    psubw m1, m2
    pmulhrsw m1, m7
    paddw m1, m2
    mova [tmpq], m1
    add tmpq, 64
    sub hd, 2
    jg .hv_w16_loop
    RET
.hv_w32:
    pmullw m0, m4, [srcq+strideq*0+0]
    pmullw m1, m5, [srcq+strideq*0+2]
    psubw m0, m6
    paddw m0, m1
    psraw m0, 2
.hv_w32_loop:
    pmullw m3, m4, [srcq+strideq*1+0]
    pmullw m1, m5, [srcq+strideq*1+2]
    lea srcq, [srcq+strideq*2]
    psubw m3, m6
    paddw m3, m1
    psraw m3, 2
    psubw m1, m3, m0
    pmulhrsw m1, m7
    paddw m1, m0
    pmullw m0, m4, [srcq+strideq*0+0]
    pmullw m2, m5, [srcq+strideq*0+2]
    psubw m0, m6
    paddw m0, m2
    psraw m0, 2
    psubw m2, m0, m3
    pmulhrsw m2, m7
    paddw m2, m3
    mova [tmpq+64*0], m1
    mova [tmpq+64*1], m2
    add tmpq, 64*2
    sub hd, 2
    jg .hv_w32_loop
    RET
.hv_w64:
    pmullw m0, m4, [srcq+ 0]
    pmullw m2, m5, [srcq+ 2]
    pmullw m1, m4, [srcq+64]
    pmullw m3, m5, [srcq+66]
    psubw m0, m6
    psubw m1, m6
    paddw m0, m2
    paddw m1, m3
    psraw m0, 2
    psraw m1, 2
.hv_w64_loop:
    add srcq, strideq
    pmullw m2, m4, [srcq+ 0]
    pmullw m8, m5, [srcq+ 2]
    pmullw m3, m4, [srcq+64]
    pmullw m9, m5, [srcq+66]
    psubw m2, m6
    psubw m3, m6
    paddw m2, m8
    paddw m3, m9
    psraw m2, 2
    psraw m3, 2
    psubw m8, m2, m0
    psubw m9, m3, m1
    pmulhrsw m8, m7
    pmulhrsw m9, m7
    paddw m8, m0
    mova m0, m2
    paddw m9, m1
    mova m1, m3
    mova [tmpq+64*0], m8
    mova [tmpq+64*1], m9
    add tmpq, 64*2
    dec hd
    jg .hv_w64_loop
    RET
.hv_w128:
    pmullw m0, m4, [srcq+ 0]
    pmullw m8, m5, [srcq+ 2]
    pmullw m1, m4, [srcq+ 64]
    pmullw m9, m5, [srcq+ 66]
    pmullw m2, m4, [srcq+128]
    pmullw m10, m5, [srcq+130]
    pmullw m3, m4, [srcq+192]
    pmullw m11, m5, [srcq+194]
    REPX {psubw x, m6}, m0, m1, m2, m3
    paddw m0, m8
    paddw m1, m9
    paddw m2, m10
    paddw m3, m11
    REPX {psraw x, 2}, m0, m1, m2, m3
.hv_w128_loop:
    add srcq, strideq
    pmullw m8, m4, [srcq+ 0]
    pmullw m12, m5, [srcq+ 2]
    pmullw m9, m4, [srcq+ 64]
    pmullw m13, m5, [srcq+ 66]
    pmullw m10, m4, [srcq+128]
    pmullw m14, m5, [srcq+130]
    pmullw m11, m4, [srcq+192]
    pmullw m15, m5, [srcq+194]
    REPX {psubw x, m6}, m8, m9, m10, m11
    paddw m8, m12
    paddw m9, m13
    paddw m10, m14
    paddw m11, m15
    REPX {psraw x, 2}, m8, m9, m10, m11
    psubw m12, m8, m0
    psubw m13, m9, m1
    psubw m14, m10, m2
    psubw m15, m11, m3
    REPX {pmulhrsw x, m7}, m12, m13, m14, m15
    paddw m12, m0
    mova m0, m8
    paddw m13, m1
    mova m1, m9
    mova [tmpq+64*0], m12
    mova [tmpq+64*1], m13
    paddw m14, m2
    mova m2, m10
    paddw m15, m3
    mova m3, m11
    mova [tmpq+64*2], m14
    mova [tmpq+64*3], m15
    add tmpq, 64*4
    dec hd
    jg .hv_w128_loop
    RET

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH (1*15 << 16) | 4*15
%assign FILTER_SHARP (2*15 << 16) | 3*15
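; Packed filter-table offsets: the low word presumably selects the 4-tap
; row group and the high word the 8-tap row group of subpel_filters
; (regular=0/3, smooth=1/4, sharp=2/3, in units of 15 entries).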
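; Each public filter-combination entry point below just loads the packed
; offsets for its (h, v) filter pair into t0d/t1d, then falls through or
; jumps to the shared implementation.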
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
cglobal %1_%2_16bpc
    mov t0d, FILTER_%3
%ifidn %3, %4
    mov t1d, t0d
%else
    mov t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro

%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v
cglobal %1_8tap_%2_16bpc
    mov t0d, FILTER_%3
%ifidn %3, %4
    mov t1d, t0d
%else
    mov t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX)
%endif
%endmacro

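; Scratch buffer for spilling filter coefficients: presumably the
; caller-allocated shadow space on Win64 and the red zone below rsp on
; SysV targets.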
%if WIN64
DECLARE_REG_TMP 4, 5
%define buf rsp+stack_offset+8 ; shadow space
%else
DECLARE_REG_TMP 7, 8
%define buf rsp-40 ; red zone
%endif

%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth, SMOOTH, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH, put_6tap_16bpc
PUT_8TAP_FN regular, REGULAR, REGULAR

cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%define base r8-put_avx512icl
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    lea r8, [put_avx512icl]
    movifnidn wd, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
.put:
    tzcnt wd, wd
    movzx wd, word [r8+wq*2+table_offset(put,)]
    add wq, r8
%if WIN64
    pop r8
%endif
    jmp wq
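; 6-tap horizontal filtering: m10/m11/m12 hold the three broadcast tap
; pairs, the spel_h_shuf tables gather the overlapping source pixels, and
; each vpdpwssd accumulates two taps on top of the rounding bias in m8.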
.h_w8:
    mova m4, [spel_h_shufA]
    movu m5, [spel_h_shufB]
    movu m6, [spel_h_shufC]
.h_w8_loop:
    movu ym2, [srcq+ssq*0]
    vinserti32x8 m2, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    mova m0, m8
    vpermb m1, m4, m2
    vpdpwssd m0, m10, m1
    vpermb m1, m5, m2
    vpdpwssd m0, m11, m1
    vpermb m1, m6, m2
    vpdpwssd m0, m12, m1
    psrad m0, 6
    vextracti32x8 ym1, m0, 1
    packusdw ym0, ym1
    pminsw ym0, ym15
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8_loop
    RET
.h:
    vpbroadcastw m15, r8m
    test myd, 0xf00
    jnz .hv
    mov r7d, r8m
    shr r7d, 11
    vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
    cmp wd, 4
    jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4
    shr mxd, 16
    sub srcq, 4
    pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
    mova [buf], xmm0
    vpbroadcastd m10, xmm0
    vpbroadcastd m12, [buf+8]
    vpbroadcastd m11, [buf+4]
    sub wd, 16
    jl .h_w8
    vbroadcasti32x4 m6, [spel_h_shufA]
    vbroadcasti32x4 m7, [spel_h_shufB]
    jg .h_w32
.h_w16_loop:
    movu ym2, [srcq+ssq*0+ 0]
    vinserti32x8 m2, [srcq+ssq*1+ 0], 1
    movu ym3, [srcq+ssq*0+12]
    vinserti32x8 m3, [srcq+ssq*1+12], 1
    lea srcq, [srcq+ssq*2]
    mova m0, m8
    mova m1, m8
    pshufb m4, m2, m6
    vpdpwssd m0, m10, m4 ; a0 b0
    pshufb m4, m3, m7
    vpdpwssd m1, m12, m4 ; a2' b2'
    pshufb m2, m7
    pshufb m3, m6
    vpdpwssd m0, m11, m2 ; a1 b1
    vpdpwssd m1, m11, m3 ; a1' b1'
    shufpd m2, m3, 0x55
    vpdpwssd m0, m12, m2 ; a2 b2
    vpdpwssd m1, m10, m2 ; a0' b0'
    psrad m0, 6
    psrad m1, 6
    packusdw m0, m1
    pminsw m0, m15
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w16_loop
    RET
.h_w32:
    lea srcq, [srcq+wq*2]
    lea dstq, [dstq+wq*2]
    neg wq
.h_w32_loop0:
    mov r6, wq
.h_w32_loop:
    movu m2, [srcq+r6*2+ 0]
    movu m3, [srcq+r6*2+12]
    mova m0, m8
    mova m1, m8
    pshufb m4, m2, m6
    vpdpwssd m0, m10, m4 ; a0
    pshufb m4, m3, m7
    vpdpwssd m1, m12, m4 ; b2
    pshufb m2, m7
    pshufb m3, m6
    vpdpwssd m0, m11, m2 ; a1
    vpdpwssd m1, m11, m3 ; b1
    shufpd m2, m3, 0x55
    vpdpwssd m0, m12, m2 ; a2
    vpdpwssd m1, m10, m2 ; b0
    psrad m0, 6
    psrad m1, 6
    packusdw m0, m1
    pminsw m0, m15
    mova [dstq+r6*2], m0
    add r6, 32
    jl .h_w32_loop
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .h_w32_loop0
    RET
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastd m11, [pd_32]
    pmovsxbw xmm0, [base+subpel_filters+1+myq*8]
    tzcnt r7d, wd
    vpbroadcastw m15, r8m
    mov r6, ssq
    movzx r7d, word [r8+r7*2+table_offset(put, _6tap_v)]
    neg r6
    mova [rsp+stack_offset+8], xmm0
    vpbroadcastd m12, xmm0
    add r7, r8
    vpbroadcastd m13, [rsp+stack_offset+12]
    vpbroadcastd m14, [rsp+stack_offset+16]
    jmp r7
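; 6-tap vertical filtering: adjacent rows are interleaved with punpcklwd
; (01/23/45 pairs) so each vpdpwssd applies two taps (m12/m13/m14),
; accumulating onto the pd_32 rounding constant before psrad 6.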
.v_w2:
    movd xmm2, [srcq+r6 *2]
    pinsrd xmm2, [srcq+r6 *1], 1
    pinsrd xmm2, [srcq+ssq*0], 2
    pinsrd xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    movd xmm0, [srcq+ssq*0]
    palignr xmm3, xmm0, xmm2, 4 ; 1 2 3 4
    punpcklwd xmm1, xmm2, xmm3 ; 01 12
    punpckhwd xmm2, xmm3 ; 23 34
.v_w2_loop:
    movd xmm3, [srcq+ssq*1]
    mova xmm4, xm11
    vpdpwssd xmm4, xmm1, xm12 ; a0 b0
    lea srcq, [srcq+ssq*2]
    mova xmm1, xmm2
    vpdpwssd xmm4, xmm2, xm13 ; a1 b1
    punpckldq xmm2, xmm0, xmm3 ; 4 5
    movd xmm0, [srcq+ssq*0]
    punpckldq xmm3, xmm0 ; 5 6
    punpcklwd xmm2, xmm3 ; 45 56
    vpdpwssd xmm4, xmm2, xm14 ; a2 b2
    psrad xmm4, 6
    packusdw xmm4, xmm4
    pminsw xmm4, xm15
    movd [dstq+dsq*0], xmm4
    pextrd [dstq+dsq*1], xmm4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq xmm1, [srcq+r6 *2]
    vpbroadcastq ymm3, [srcq+r6 *1]
    vpbroadcastq ymm2, [srcq+ssq*0]
    vpbroadcastq ymm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm1, ymm3, 0x30
    vpblendd ymm3, ymm2, 0x30
    punpcklwd ymm1, ymm3 ; 01 12
    vpblendd ymm2, ymm4, 0x30
    vpblendd ymm4, ymm0, 0x30
    punpcklwd ymm2, ymm4 ; 23 34
.v_w4_loop:
    vpbroadcastq ymm3, [srcq+ssq*1]
    mova ymm4, ym11
    vpdpwssd ymm4, ymm1, ym12 ; a0 b0
    lea srcq, [srcq+ssq*2]
    mova ymm1, ymm2
    vpdpwssd ymm4, ymm2, ym13 ; a1 b1
    vpblendd ymm2, ymm0, ymm3, 0x30
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm3, ymm0, 0x30
    punpcklwd ymm2, ymm3 ; 45 56
    vpdpwssd ymm4, ymm2, ym14 ; a2 b2
    psrad ymm4, 6
    vextracti128 xmm3, ymm4, 1
    packusdw xmm4, xmm3
    pminsw xmm4, xm15
    movq [dstq+dsq*0], xmm4
    movhps [dstq+dsq*1], xmm4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    vzeroupper
    RET
.v_w8:
    vbroadcasti32x4 m0, [srcq+ssq*0]
    vinserti32x4 m1, m0, [srcq+r6 *2], 0
    vinserti32x4 m1, [srcq+r6 *1], 1 ; 0 1 2
    vinserti32x4 ym0, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    mova m5, [spel_v_shuf8]
    vinserti32x4 m0, [srcq+ssq*0], 2 ; 2 3 4
    vpermb m1, m5, m1 ; 01 12
    vpermb m2, m5, m0 ; 23 34
.v_w8_loop:
    vinserti32x4 m0, [srcq+ssq*1], 3
    lea srcq, [srcq+ssq*2]
    movu xm3, [srcq+ssq*0]
    mova m4, m11
    vpdpwssd m4, m12, m1 ; a0 b0
    vshufi32x4 m0, m3, q1032 ; 4 5 6
    mova m1, m2
    vpdpwssd m4, m13, m2 ; a1 b1
    vpermb m2, m5, m0 ; 45 56
    vpdpwssd m4, m14, m2 ; a2 b2
    psrad m4, 6
    vextracti32x8 ym3, m4, 1
    packusdw ym4, ym3
    pminsw ym4, ym15
    mova [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    RET
.v_w16:
    vbroadcasti32x8 m0, [srcq+r6 *1]
    vinserti32x8 m1, m0, [srcq+ssq*0], 1
    vinserti32x8 m0, [srcq+r6*2], 0
    mova m6, [spel_v_shuf16]
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m3, [srcq+ssq*0], 1
    vpermb m1, m6, m1 ; 12
    vpermb m0, m6, m0 ; 01
    vpermb m3, m6, m3 ; 34
    mova m7, [deint_q_shuf]
    vpshrdd m2, m1, m3, 16 ; 23
.v_w16_loop:
    mova m5, m11
    vpdpwssd m5, m12, m1 ; b0
    mova m4, m11
    vpdpwssd m4, m12, m0 ; a0
    mova m1, m3
    vpdpwssd m5, m13, m3 ; b1
    mova m0, m2
    vpdpwssd m4, m13, m2 ; a1
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m3, [srcq+ssq*0], 1
    vpermb m3, m6, m3 ; 56
    vpshrdd m2, m1, m3, 16 ; 45
    vpdpwssd m5, m14, m3 ; b2
    vpdpwssd m4, m14, m2 ; a2
    psrad m5, 6
    psrad m4, 6
    packusdw m4, m5
    pminsw m4, m15
    vpermq m4, m7, m4
    mova [dstq+dsq*0], ym4
    vextracti32x8 [dstq+dsq*1], m4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    RET
.v_w32:
.v_w64:
.v_w128:
    lea wd, [hq+wq*8-256]
.v_w32_loop0:
    movu m16, [srcq+r6 *2]
    movu m17, [srcq+r6 *1]
    lea r7, [srcq+ssq*2]
    movu m18, [srcq+ssq*0]
    movu m19, [srcq+ssq*1]
    mov r8, dstq
    movu m20, [r7 +ssq*0]
    punpcklwd m0, m16, m17 ; 01
    punpckhwd m16, m17
    punpcklwd m1, m17, m18 ; 12
    punpckhwd m17, m18
    punpcklwd m2, m18, m19 ; 23
    punpckhwd m18, m19
    punpcklwd m3, m19, m20 ; 34
    punpckhwd m19, m20
.v_w32_loop:
    mova m4, m11
    vpdpwssd m4, m12, m0 ; a0
    mova m6, m11
    vpdpwssd m6, m12, m16
    mova m5, m11
    vpdpwssd m5, m12, m1 ; b0
    mova m7, m11
    vpdpwssd m7, m12, m17
    mova m0, m2
    vpdpwssd m4, m13, m2 ; a1
    mova m16, m18
    vpdpwssd m6, m13, m18
    mova m1, m3
    vpdpwssd m5, m13, m3 ; b1
    mova m17, m19
    vpdpwssd m7, m13, m19
    movu m19, [r7+ssq*1]
    lea r7, [r7+ssq*2]
    punpcklwd m2, m20, m19 ; 45
    punpckhwd m18, m20, m19
    movu m20, [r7+ssq*0]
    vpdpwssd m4, m14, m2 ; a2
    vpdpwssd m6, m14, m18
    punpcklwd m3, m19, m20 ; 56
    punpckhwd m19, m20
    vpdpwssd m5, m14, m3 ; b2
    vpdpwssd m7, m14, m19
    REPX {psrad x, 6}, m4, m6, m5, m7
    packusdw m4, m6
    packusdw m5, m7
    pminsw m4, m15
    pminsw m5, m15
    mova [r8+dsq*0], m4
    mova [r8+dsq*1], m5
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .v_w32_loop
    add srcq, 64
    add dstq, 64
    movzx hd, wb
    sub wd, 1<<8
    jg .v_w32_loop0
    vzeroupper
    RET
.hv:
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 2
    neg r6
    test dword r8m, 0x800
    jnz .hv_12bit
    vpbroadcastd m10, [pd_2176]
    psllw xmm0, 6
    jmp .hv_main
.hv_12bit:
    vpbroadcastd m10, [pd_640]
    psllw xmm0, 4
    psllw xmm1, 2
.hv_main:
    movu xm4, [srcq+r6 *2]
    vinserti32x4 ym4, [srcq+r6 *1], 1
    vinserti32x4 m4, [srcq+ssq*0], 2
    vbroadcasti32x4 m6, [spel_h_shufA]
    vinserti32x4 m4, [srcq+ssq*1], 3 ; 0 1 2 3
    lea srcq, [srcq+ssq*2]
    movu xm5, [srcq+ssq*0] ; 4
    mova [buf+ 0], xmm0
    mova [buf+16], xmm1
    vpbroadcastd m8, [buf+ 4]
    vpbroadcastd m9, [buf+ 8]
    vpbroadcastd ym12, xmm1
    vpbroadcastd ym13, [buf+20]
    vpbroadcastd ym14, [buf+24]
    cmp wd, 4
    je .hv_w4
    vbroadcasti32x4 m2, [spel_h_shufA]
    mova m3, [spel_h_shuf2b]
    mova m1, m10
    pshufb m4, m6
    pshufb xm5, xm6
    punpcklqdq m2, m4, m5
    vpdpwssd m1, m8, m2 ; 04 1_ 2_ 3_
    mova ym6, [spel_h_shuf2a]
    punpckhqdq m4, m5
    mova xm5, [spel_shuf2]
    vpdpwssd m1, m9, m4
    vpermb m1, m3, m1 ; 01 12
    vextracti32x4 xm2, ym1, 1 ; 23 34
.hv_w2_loop:
    movu xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym3, [srcq+ssq*0], 1
    vpermb ym3, ym6, ym3
    pmaddwd xmm0, xm12, xm1 ; a0 b0
    mova xm4, xm10
    vpdpwssd xm4, xm8, xm3
    vextracti32x4 xm3, ym3, 1
    mova xm1, xm2
    vpdpwssd xmm0, xm13, xm2 ; a1 b1
    vpdpwssd xm4, xm9, xm3 ; 5 6
    vpermt2b xm2, xm5, xm4 ; 45 56
    vpdpwssd xmm0, xm14, xm2 ; a2 b2
    psrad xmm0, 10
    packusdw xmm0, xmm0
    pminsw xmm0, xm15
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    vbroadcasti32x4 m7, [spel_h_shufB]
    mova ym0, [spel_shuf4a]
    pshufb m1, m4, m6
    mova m2, m10
    vpdpwssd m2, m8, m1
    pshufb xm1, xm5, xm6
    mova xm3, xm10
    vpdpwssd xm3, xm8, xm1
    pshufb m4, m7
    pshufb xm5, xm7
    vpdpwssd m2, m9, m4 ; 0 1 2 3
    vpdpwssd xm3, xm9, xm5 ; 4
    mova ym5, [spel_shuf4b]
    vpermb m1, m0, m2 ; 01 12
    vshufi32x4 m2, m3, q1032 ; 2 3 4
    vpermb m2, m0, m2 ; 23 34
.hv_w4_loop:
    movu xm3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym3, [srcq+ssq*0], 1
    pmaddwd ym0, ym12, ym1 ; a0 b0
    mova ym1, ym2
    pshufb ym4, ym3, ym6
    mova ym2, ym10
    vpdpwssd ym2, ym8, ym4
    pshufb ym3, ym7
    vpdpwssd ym0, ym13, ym1 ; a1 b1
    vpdpwssd ym2, ym9, ym3 ; 5 6
    vpermt2b ym2, ym5, ym1 ; 45 56
    vpdpwssd ym0, ym14, ym2 ; a2 b2
    psrad ym0, 10
    vextracti32x4 xm4, ym0, 1
    packusdw xm0, xm4
    pminsw xmm0, xm0, xm15
    movq [dstq+dsq*0], xmm0
    movhps [dstq+dsq*1], xmm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    shr mxd, 16
    pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 4
    neg r6
    test dword r8m, 0x800
    jnz .hv_w8_12bit
    vpbroadcastd m8, [pd_2176]
    psllw xmm0, 6
    jmp .hv_w8_main
.hv_w8_12bit:
    vpbroadcastd m8, [pd_640]
    psllw xmm0, 4
    psllw xmm1, 2
.hv_w8_main:
    mova [buf+ 0], xmm0
    mova [buf+16], xmm1
    vpbroadcastd m9, xmm0
    vpbroadcastd m10, [buf+ 4]
    vpbroadcastd m11, [buf+ 8]
    vpbroadcastd m12, xmm1
    vpbroadcastd m13, [buf+20]
    vpbroadcastd m14, [buf+24]
    cmp wd, 16
    jge .hv_w16
    mova m6, [spel_h_shufA]
    movu ym16, [srcq+r6 *2]
    vinserti32x8 m16, [srcq+r6 *1], 1 ; 0 1
    movu ym17, [srcq+ssq*0]
    vinserti32x8 m17, [srcq+ssq*1], 1 ; 2 3
    lea srcq, [srcq+ssq*2]
    movu ym18, [srcq+ssq*0] ; 4
    movu m7, [spel_h_shufC]
    vpermb m3, m6, m16
    mova m1, m8
    vpermb m4, m6, m17
    vpdpwssd m1, m9, m3 ; a0 b0
    mova m2, m8
    vpermb m5, m6, m18
    vpdpwssd m2, m9, m4 ; c0 d0
    mova m0, m8
    vpermb m16, m7, m16
    vpdpwssd m0, m9, m5 ; e0
    vpermb m17, m7, m17
    vpdpwssd m1, m11, m16 ; a2 b2
    vpermb m18, m7, m18
    vpdpwssd m2, m11, m17 ; c2 d2
    shufpd m3, m16, 0x55
    vpdpwssd m0, m11, m18 ; e2
    mova m16, [spel_shuf8a]
    shufpd m4, m17, 0x55
    vpdpwssd m1, m10, m3 ; a1 b1
    shufpd m5, m18, 0x55
    vpdpwssd m2, m10, m4 ; c1 d1
    vpdpwssd m0, m10, m5 ; e1
    mova m5, [spel_shuf8b]
    vpermt2b m1, m16, m2 ; 01 12
    vpermt2b m2, m16, m0 ; 23 34
.hv_w8_loop:
    movu ym18, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0], 1
    mova m0, m8
    vpermb m17, m6, m18
    vpdpwssd m0, m9, m17 ; f0 g0
    vpermb m18, m7, m18
    pmaddwd m16, m12, m1 ; A0 B0
    vpdpwssd m0, m11, m18 ; f2 g2
    shufpd m17, m18, 0x55
    mova m1, m2
    vpdpwssd m16, m13, m2 ; A1 B1
    vpdpwssd m0, m10, m17 ; f1 g1
    vpermt2b m2, m5, m0 ; 45 56
    vpdpwssd m16, m14, m2 ; A2 B2
    psrad m16, 10
    vextracti32x8 ym17, m16, 1
    packusdw ym16, ym17
    pminsw ym16, ym15
    mova [dstq+dsq*0], xm16
    vextracti128 [dstq+dsq*1], ym16, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    vzeroupper
    RET
.hv_w16:
    vbroadcasti32x4 m20, [spel_h_shufA]
    vbroadcasti32x4 m21, [spel_h_shufB]
    jg .hv_w32
    vbroadcasti32x8 m6, [srcq+r6 *2+ 8]
    vinserti32x8 m2, m6, [srcq+r6 *2+16], 1
    vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0
    movu ym16, [srcq+r6 *1+ 0]
    movu ym17, [srcq+r6 *1+12]
    vinserti32x8 m16, [srcq+ssq*0+ 0], 1
    vinserti32x8 m17, [srcq+ssq*0+12], 1 ; 1 2
    movu ym18, [srcq+ssq*1+ 0]
    movu ym19, [srcq+ssq*1+12]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0+ 0], 1
    vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 3 4
    pshufb m2, m20
    mova m1, m8
    pshufb m3, m16, m20
    vpdpwssd m1, m11, m2 ; a2
    mova m2, m8
    pshufb m4, m17, m21
    vpdpwssd m2, m9, m3 ; b0 c0
    mova m3, m8
    pshufb m5, m18, m20
    vpdpwssd m3, m11, m4 ; b2' c2'
    mova m4, m8
    pshufb m7, m19, m21
    vpdpwssd m4, m9, m5 ; d0 e0
    mova m5, m8
    pshufb m0, m6, m20
    vpdpwssd m5, m11, m7 ; d2' e2'
    mova m7, [spel_shuf16]
    pshufb m16, m21
    vpdpwssd m1, m9, m0 ; a0
    pshufb m17, m20
    vpdpwssd m2, m10, m16 ; b1 c1
    pshufb m18, m21
    vpdpwssd m3, m10, m17 ; b1' c1'
    pshufb m19, m20
    vpdpwssd m4, m10, m18 ; d1 e1
    pshufb m6, m21
    vpdpwssd m5, m10, m19 ; d1' e1'
    shufpd m16, m17, 0x55
    vpdpwssd m1, m10, m6 ; a1
    shufpd m18, m19, 0x55
    vpdpwssd m2, m11, m16 ; b2 c2
    vpdpwssd m3, m9, m16 ; b0' c0'
    vpdpwssd m4, m11, m18 ; d2 e2
    vpdpwssd m5, m9, m18 ; d0' e0'
    pslldq m1, 1
    vpermt2b m2, m7, m3 ; 12
    vpermt2b m4, m7, m5 ; 34
    vpshrdd m1, m2, 16 ; 01
    vpshrdd m3, m2, m4, 16 ; 23
.hv_w16_loop:
    movu ym18, [srcq+ssq*1+ 0]
    movu ym19, [srcq+ssq*1+12]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0+ 0], 1
    vinserti32x8 m19, [srcq+ssq*0+12], 1
    mova m5, m8
    mova m6, m8
    pshufb m17, m18, m20
    vpdpwssd m5, m9, m17 ; f0 g0
    pshufb m16, m19, m21
    vpdpwssd m6, m11, m16 ; f2' g2'
    pmaddwd m17, m12, m2 ; B0
    mova m2, m4
    pmaddwd m16, m12, m1 ; A0
    mova m1, m3
    pshufb m18, m21
    vpdpwssd m5, m10, m18 ; f1 g1
    pshufb m19, m20
    vpdpwssd m6, m10, m19 ; f1' g1'
    vpdpwssd m17, m13, m4 ; B1
    vpdpwssd m16, m13, m3 ; A1
    shufpd m18, m19, 0x55
    vpdpwssd m5, m11, m18 ; f2 g2
    vpdpwssd m6, m9, m18 ; f0' g0'
    mova m4, m7
    vpermi2b m4, m5, m6 ; 56
    vpshrdd m3, m2, m4, 16 ; 45
    vpdpwssd m17, m14, m4 ; B2
    vpdpwssd m16, m14, m3 ; A2
    psrad m16, 10
    psrad m17, 10
    vshufi32x4 m18, m16, m17, q3232
    vinserti32x8 m16, ym17, 1
    packusdw m16, m18
    pminsw m16, m15
    mova [dstq+dsq*0], ym16
    vextracti32x8 [dstq+dsq*1], m16, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w16_loop
    vzeroupper
    RET
.hv_w32:
|
|
WIN64_SPILL_XMM 28
|
|
mova m27, [spel_shuf32]
|
|
lea wd, [hq+wq*8-256]
|
|
.hv_w32_loop0:
|
|
movu m16, [srcq+r6 *2+ 0]
|
|
movu m7, [srcq+r6 *2+12]
|
|
movu m6, [srcq+r6 *1+ 0]
|
|
movu m18, [srcq+r6 *1+12]
|
|
lea r7, [srcq+ssq*2]
|
|
movu m17, [srcq+ssq*0+ 0]
|
|
movu m19, [srcq+ssq*0+12]
|
|
movu m22, [srcq+ssq*1+ 0]
|
|
movu m24, [srcq+ssq*1+12]
|
|
mov r8, dstq
|
|
movu m23, [r7 +ssq*0+ 0]
|
|
movu m25, [r7 +ssq*0+12]
|
|
pshufb m1, m16, m20
|
|
mova m0, m8
|
|
pshufb m2, m7, m21
|
|
vpdpwssd m0, m9, m1 ; a0
|
|
mova m1, m8
|
|
pshufb m4, m6, m20
|
|
vpdpwssd m1, m11, m2 ; a2'
|
|
mova m2, m8
|
|
pshufb m3, m17, m20
|
|
vpdpwssd m2, m9, m4 ; b0
|
|
mova m4, m8
|
|
pshufb m5, m18, m21
|
|
vpdpwssd m4, m9, m3 ; c0
|
|
mova m3, m8
|
|
pshufb m26, m19, m21
|
|
vpdpwssd m3, m11, m5 ; b2'
|
|
mova m5, m8
|
|
pshufb m16, m21
|
|
vpdpwssd m5, m11, m26 ; c2'
|
|
pshufb m7, m20
|
|
vpdpwssd m0, m10, m16 ; a1
|
|
pshufb m6, m21
|
|
vpdpwssd m1, m10, m7 ; a1'
|
|
pshufb m17, m21
|
|
vpdpwssd m2, m10, m6 ; b1
|
|
pshufb m18, m20
|
|
vpdpwssd m4, m10, m17 ; c1
|
|
pshufb m19, m20
|
|
vpdpwssd m3, m10, m18 ; b1'
|
|
shufpd m16, m7, 0x55
|
|
vpdpwssd m5, m10, m19 ; c1'
|
|
shufpd m6, m18, 0x55
|
|
vpdpwssd m0, m11, m16 ; a2
|
|
shufpd m17, m19, 0x55
|
|
vpdpwssd m1, m9, m16 ; a0'
|
|
pshufb m16, m22, m20
|
|
vpdpwssd m2, m11, m6 ; b2
|
|
pshufb m7, m23, m20
|
|
vpdpwssd m4, m11, m17 ; c2
|
|
vpdpwssd m3, m9, m6 ; b0'
|
|
mova m6, m8
|
|
vpdpwssd m5, m9, m17 ; c0'
|
|
pshufb m17, m24, m21
|
|
vpdpwssd m6, m9, m16 ; d0
|
|
mova m16, m8
|
|
pshufb m26, m25, m21
|
|
vpdpwssd m16, m9, m7 ; e0
|
|
mova m7, m8
|
|
pshufb m22, m21
|
|
vpdpwssd m7, m11, m17 ; d2'
|
|
mova m17, m8
|
|
pshufb m23, m21
|
|
vpdpwssd m17, m11, m26 ; e2'
|
|
pshufb m24, m20
|
|
vpdpwssd m6, m10, m22 ; d1
|
|
pshufb m25, m20
|
|
vpdpwssd m16, m10, m23 ; e1
|
|
shufpd m22, m24, 0x55
|
|
vpdpwssd m7, m10, m24 ; d1'
|
|
shufpd m23, m25, 0x55
|
|
vpdpwssd m17, m10, m25 ; e1'
|
|
pslldq m0, 1
|
|
vpdpwssd m6, m11, m22 ; d2
|
|
pslldq m1, 1
|
|
vpdpwssd m16, m11, m23 ; e2
|
|
vpermt2b m2, m27, m4 ; 12
|
|
vpdpwssd m7, m9, m22 ; d0'
|
|
vpermt2b m3, m27, m5 ; 12'
|
|
vpdpwssd m17, m9, m23 ; e0'
|
|
vpshrdd m0, m2, 16 ; 01
|
|
vpermt2b m6, m27, m16 ; 34
|
|
vpshrdd m1, m3, 16 ; 01'
|
|
vpermt2b m7, m27, m17 ; 34'
|
|
vpshrdd m4, m2, m6, 16 ; 23
|
|
vpshrdd m5, m3, m7, 16 ; 23'
|
|
.hv_w32_loop:
|
|
movu m22, [r7+ssq*1+ 0]
|
|
movu m24, [r7+ssq*1+12]
|
|
lea r7, [r7+ssq*2]
|
|
movu m23, [r7+ssq*0+ 0]
|
|
movu m25, [r7+ssq*0+12]
|
|
pmaddwd m17, m12, m2 ; B0
|
|
mova m2, m6
|
|
pmaddwd m19, m12, m3 ; B0'
|
|
mova m3, m7
|
|
pmaddwd m16, m12, m0 ; A0
|
|
mova m0, m4
|
|
pmaddwd m18, m12, m1 ; A0'
|
|
mova m1, m5
|
|
vpdpwssd m17, m13, m6 ; B1
|
|
vpdpwssd m19, m13, m7 ; B1'
|
|
mova m6, m8
|
|
vpdpwssd m16, m13, m4 ; A1
|
|
pshufb m4, m22, m20
|
|
vpdpwssd m18, m13, m5 ; A1'
|
|
pshufb m7, m23, m20
|
|
vpdpwssd m6, m9, m4 ; f0
|
|
mova m4, m8
|
|
pshufb m5, m24, m21
|
|
vpdpwssd m4, m9, m7 ; g0
|
|
mova m7, m8
|
|
pshufb m26, m25, m21
|
|
vpdpwssd m7, m11, m5 ; f2'
|
|
mova m5, m8
|
|
pshufb m22, m21
|
|
vpdpwssd m5, m11, m26 ; g2'
|
|
pshufb m23, m21
|
|
vpdpwssd m6, m10, m22 ; f1
|
|
pshufb m24, m20
|
|
vpdpwssd m4, m10, m23 ; g1
|
|
pshufb m25, m20
|
|
vpdpwssd m7, m10, m24 ; f1'
|
|
shufpd m22, m24, 0x55
|
|
vpdpwssd m5, m10, m25 ; g1'
|
|
shufpd m23, m25, 0x55
|
|
vpdpwssd m6, m11, m22 ; f2
|
|
vpdpwssd m4, m11, m23 ; g2
|
|
vpdpwssd m7, m9, m22 ; f0'
|
|
vpdpwssd m5, m9, m23 ; g0'
|
|
vpermt2b m6, m27, m4 ; 56
|
|
vpermt2b m7, m27, m5 ; 56'
|
|
vpdpwssd m17, m14, m6 ; B2
|
|
vpshrdd m4, m2, m6, 16 ; 45
|
|
vpdpwssd m19, m14, m7 ; B2'
|
|
vpshrdd m5, m3, m7, 16 ; 45'
|
|
vpdpwssd m16, m14, m4 ; A2
|
|
vpdpwssd m18, m14, m5 ; A2'
|
|
REPX {psrad x, 10}, m17, m19, m16, m18
|
|
packusdw m17, m19
|
|
packusdw m16, m18
|
|
pminsw m17, m15
|
|
pminsw m16, m15
|
|
mova [r8+dsq*0], m16
|
|
mova [r8+dsq*1], m17
|
|
lea r8, [r8+dsq*2]
|
|
sub hd, 2
|
|
jg .hv_w32_loop
|
|
add srcq, 64
|
|
add dstq, 64
|
|
movzx hd, wb
|
|
sub wd, 1<<8
|
|
jg .hv_w32_loop0
|
|
RET
|
|
|
|
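; Filter combinations involving SHARP require the full 8-tap kernel and are
; handled below; calls with no subpel offset at all are forwarded to the
; 6-tap .put copy path (see the jz in the prologue).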
PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP, put_8tap_16bpc
PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH, put_8tap_16bpc
PUT_8TAP_FN regular_sharp, REGULAR, SHARP, put_8tap_16bpc
PUT_8TAP_FN sharp_regular, SHARP, REGULAR, put_8tap_16bpc
PUT_8TAP_FN sharp, SHARP, SHARP

cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    lea r8, [put_avx512icl]
    movifnidn wd, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    vpbroadcastd m10, [pd_32]
    pmovsxbw xmm0, [base+subpel_filters+myq*8]
    tzcnt r7d, wd
    vpbroadcastw m11, r8m
    lea r6, [ssq*3]
    movzx r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
    sub srcq, r6
    mova [rsp+stack_offset+8], xmm0
    vpbroadcastd m12, xmm0
    add r7, r8
    vpbroadcastd m13, [rsp+stack_offset+12]
    vpbroadcastd m14, [rsp+stack_offset+16]
    vpbroadcastd m15, [rsp+stack_offset+20]
    jmp r7
.v_w2:
    movd xmm2, [srcq+ssq*0]
    pinsrd xmm2, [srcq+ssq*1], 1
    pinsrd xmm2, [srcq+ssq*2], 2
    add srcq, r6
    pinsrd xmm2, [srcq+ssq*0], 3 ; 0 1 2 3
    movd xmm3, [srcq+ssq*1]
    vpbroadcastd xmm1, [srcq+ssq*2]
    add srcq, r6
    vpbroadcastd xmm0, [srcq+ssq*0]
    vpblendd xmm3, xmm1, 0x02 ; 4 5
    vpblendd xmm1, xmm0, 0x02 ; 5 6
    palignr xmm4, xmm3, xmm2, 4 ; 1 2 3 4
    punpcklwd xmm3, xmm1 ; 45 56
    punpcklwd xmm1, xmm2, xmm4 ; 01 12
    punpckhwd xmm2, xmm4 ; 23 34
.v_w2_loop:
    vpbroadcastd xmm4, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova xmm5, xm10
    vpdpwssd xmm5, xm12, xmm1 ; a0 b0
    mova xmm1, xmm2
    vpdpwssd xmm5, xm13, xmm2 ; a1 b1
    mova xmm2, xmm3
    vpdpwssd xmm5, xm14, xmm3 ; a2 b2
    vpblendd xmm3, xmm0, xmm4, 0x02 ; 6 7
    vpbroadcastd xmm0, [srcq+ssq*0]
    vpblendd xmm4, xmm0, 0x02 ; 7 8
    punpcklwd xmm3, xmm4 ; 67 78
    vpdpwssd xmm5, xm15, xmm3 ; a3 b3
    psrad xmm5, 6
    packusdw xmm5, xmm5
    pminsw xmm5, xm11
    movd [dstq+dsq*0], xmm5
    pextrd [dstq+dsq*1], xmm5, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq xmm1, [srcq+ssq*0]
    vpbroadcastq ymm0, [srcq+ssq*1]
    vpbroadcastq ymm2, [srcq+ssq*2]
    add srcq, r6
    vpbroadcastq ymm4, [srcq+ssq*0]
    vpbroadcastq ymm3, [srcq+ssq*1]
    vpbroadcastq ymm5, [srcq+ssq*2]
    add srcq, r6
    vpblendd ymm1, ymm0, 0x30
    vpblendd ymm0, ymm2, 0x30
    punpcklwd ymm1, ymm0 ; 01 12
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm2, ymm4, 0x30
    vpblendd ymm4, ymm3, 0x30
    punpcklwd ymm2, ymm4 ; 23 34
    vpblendd ymm3, ymm5, 0x30
    vpblendd ymm5, ymm0, 0x30
    punpcklwd ymm3, ymm5 ; 45 56
.v_w4_loop:
    vpbroadcastq ymm5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    mova ymm4, ym10
    vpdpwssd ymm4, ym12, ymm1 ; a0 b0
    mova ymm1, ymm2
    vpdpwssd ymm4, ym13, ymm2 ; a1 b1
    mova ymm2, ymm3
    vpdpwssd ymm4, ym14, ymm3 ; a2 b2
    vpblendd ymm3, ymm0, ymm5, 0x30
    vpbroadcastq ymm0, [srcq+ssq*0]
    vpblendd ymm5, ymm0, 0x30
    punpcklwd ymm3, ymm5 ; 67 78
    vpdpwssd ymm4, ym15, ymm3 ; a3 b3
    psrad ymm4, 6
    vextracti128 xmm5, ymm4, 1
    packusdw xmm4, xmm5
    pminsw xmm4, xm11
    movq [dstq+dsq*0], xmm4
    movhps [dstq+dsq*1], xmm4
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w4_loop
    vzeroupper
    RET
.v_w8:
    vbroadcasti32x4 m2, [srcq+ssq*2]
    vinserti32x4 m1, m2, [srcq+ssq*0], 0
    vinserti32x4 m1, [srcq+ssq*1], 1 ; 0 1 2
    add srcq, r6
    vinserti32x4 ym2, [srcq+ssq*0], 1
    vinserti32x4 m2, [srcq+ssq*1], 2 ; 2 3 4
    mova m6, [spel_v_shuf8]
    movu xm0, [srcq+ssq*1]
    vinserti32x4 ym0, [srcq+ssq*2], 1
    add srcq, r6
    vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
    vpermb m1, m6, m1 ; 01 12
    vpermb m2, m6, m2 ; 23 34
    vpermb m3, m6, m0 ; 45 56
.v_w8_loop:
    vinserti32x4 m0, [srcq+ssq*1], 3
    lea srcq, [srcq+ssq*2]
    movu xm5, [srcq+ssq*0]
    mova m4, m10
    vpdpwssd m4, m12, m1 ; a0 b0
    mova m1, m2
    vshufi32x4 m0, m5, q1032 ; 6 7 8
    vpdpwssd m4, m13, m2 ; a1 b1
    mova m2, m3
    vpdpwssd m4, m14, m3 ; a2 b2
    vpermb m3, m6, m0 ; 67 78
    vpdpwssd m4, m15, m3 ; a3 b3
    psrad m4, 6
    vextracti32x8 ym5, m4, 1
    packusdw ym4, ym5
    pminsw ym4, ym11
    mova [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w8_loop
    RET
.v_w16:
    vbroadcasti32x8 m0, [srcq+ssq*1]
    vinserti32x8 m1, m0, [srcq+ssq*2], 1
    vinserti32x8 m0, [srcq+ssq*0], 0
    mova m8, [spel_v_shuf16]
    add srcq, r6
    movu ym3, [srcq+ssq*0]
    vinserti32x8 m3, [srcq+ssq*1], 1
    movu ym5, [srcq+ssq*2]
    add srcq, r6
    vinserti32x8 m5, [srcq+ssq*0], 1
    vpermb m1, m8, m1 ; 12
    vpermb m0, m8, m0 ; 01
    vpermb m3, m8, m3 ; 34
    vpermb m5, m8, m5 ; 56
    mova m9, [deint_q_shuf]
    vpshrdd m2, m1, m3, 16 ; 23
    vpshrdd m4, m3, m5, 16 ; 45
.v_w16_loop:
    mova m7, m10
    vpdpwssd m7, m12, m1 ; b0
    mova m6, m10
    vpdpwssd m6, m12, m0 ; a0
    mova m1, m3
    vpdpwssd m7, m13, m3 ; b1
    mova m0, m2
    vpdpwssd m6, m13, m2 ; a1
    mova m3, m5
    vpdpwssd m7, m14, m5 ; b2
    mova m2, m4
    vpdpwssd m6, m14, m4 ; a2
    movu ym5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m5, [srcq+ssq*0], 1
    vpermb m5, m8, m5 ; 78
    vpshrdd m4, m3, m5, 16 ; 67
    vpdpwssd m7, m15, m5 ; b3
    vpdpwssd m6, m15, m4 ; a3
    psrad m7, 6
    psrad m6, 6
    packusdw m6, m7
    pminsw m6, m11
    vpermq m6, m9, m6
    mova [dstq+dsq*0], ym6
    vextracti32x8 [dstq+dsq*1], m6, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .v_w16_loop
    RET
.v_w32:
.v_w64:
.v_w128:
    WIN64_SPILL_XMM 23
    lea wd, [hq+wq*8-256]
.v_w32_loop0:
    movu m16, [srcq+ssq*0]
    movu m17, [srcq+ssq*1]
    lea r7, [srcq+r6 ]
    movu m18, [srcq+ssq*2]
    movu m19, [r7 +ssq*0]
    mov r8, dstq
    movu m20, [r7 +ssq*1]
    movu m21, [r7 +ssq*2]
    add r7, r6
    movu m22, [r7 +ssq*0]
    punpcklwd m0, m16, m17 ; 01l
    punpckhwd m16, m17 ; 01h
    punpcklwd m1, m17, m18 ; 12l
    punpckhwd m17, m18 ; 12h
    punpcklwd m2, m18, m19 ; 23l
    punpckhwd m18, m19 ; 23h
    punpcklwd m3, m19, m20 ; 34l
    punpckhwd m19, m20 ; 34h
    punpcklwd m4, m20, m21 ; 45l
    punpckhwd m20, m21 ; 45h
    punpcklwd m5, m21, m22 ; 56l
    punpckhwd m21, m22 ; 56h
.v_w32_loop:
    mova m6, m10
    vpdpwssd m6, m12, m0 ; a0l
    mova m8, m10
    vpdpwssd m8, m12, m16 ; a0h
    mova m7, m10
    vpdpwssd m7, m12, m1 ; b0l
    mova m9, m10
    vpdpwssd m9, m12, m17 ; b0h
    mova m0, m2
    vpdpwssd m6, m13, m2 ; a1l
    mova m16, m18
    vpdpwssd m8, m13, m18 ; a1h
    mova m1, m3
    vpdpwssd m7, m13, m3 ; b1l
    mova m17, m19
    vpdpwssd m9, m13, m19 ; b1h
    mova m2, m4
    vpdpwssd m6, m14, m4 ; a2l
    mova m18, m20
    vpdpwssd m8, m14, m20 ; a2h
    mova m3, m5
    vpdpwssd m7, m14, m5 ; b2l
    mova m19, m21
    vpdpwssd m9, m14, m21 ; b2h
    movu m21, [r7+ssq*1]
    lea r7, [r7+ssq*2]
    punpcklwd m4, m22, m21 ; 67l
    punpckhwd m20, m22, m21 ; 67h
    movu m22, [r7+ssq*0]
    vpdpwssd m6, m15, m4 ; a3l
    vpdpwssd m8, m15, m20 ; a3h
    punpcklwd m5, m21, m22 ; 78l
    punpckhwd m21, m22 ; 78h
    vpdpwssd m7, m15, m5 ; b3l
    vpdpwssd m9, m15, m21 ; b3h
    REPX {psrad x, 6}, m6, m8, m7, m9
    packusdw m6, m8
    packusdw m7, m9
    pminsw m6, m11
    pminsw m7, m11
    mova [r8+dsq*0], m6
    mova [r8+dsq*1], m7
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .v_w32_loop
    add srcq, 64
    add dstq, 64
    movzx hd, wb
    sub wd, 1<<8
    jg .v_w32_loop0
    RET
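; Horizontal-only paths. These are entered via .h further down, which sets up
; the per-bitdepth rounding constant in m8 before dispatching on block width.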
.h_w2:
    RESET_STACK_STATE
    mova ym2, [spel_h_shuf2a]
    sub srcq, 2
    pshufd xmm3, xmm0, q1111
    pshufd xmm4, xmm0, q2222
.h_w2_loop:
    movu xm1, [srcq+ssq*0]
    vinserti32x4 ym1, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    mova xmm0, xm8
    vpermb ym1, ym2, ym1
    vpdpwssd xmm0, xmm3, xm1
    vextracti32x4 xm1, ym1, 1
    vpdpwssd xmm0, xmm4, xm1
    psrad xmm0, 6
    packusdw xmm0, xmm0
    pminsw xmm0, xm15
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    movzx mxd, mxb
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    jl .h_w2
    vbroadcasti32x4 ym4, [spel_h_shufA]
    vbroadcasti32x4 ym5, [spel_h_shufB]
    sub srcq, 2
    pshufd xmm0, xmm0, q2211
    vpbroadcastq ym6, xmm0
    vpermq ym7, ymm0, q1111
.h_w4_loop:
    movu xm2, [srcq+ssq*0]
    vinserti32x4 ym2, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    mova ym0, ym8
    pshufb ym1, ym2, ym4
    vpdpwssd ym0, ym6, ym1
    pshufb ym2, ym5
    vpdpwssd ym0, ym7, ym2
    psrad ym0, 6
    vextracti32x4 xm1, ym0, 1
    packusdw xm0, xm1
    pminsw xmm0, xm0, xm15
    movq [dstq+dsq*0], xmm0
    movhps [dstq+dsq*1], xmm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w4_loop
    RET
.h_w8:
    mova m4, [spel_h_shufA]
    movu m5, [spel_h_shufB]
    movu m6, [spel_h_shufC]
    mova m7, [spel_h_shufD]
.h_w8_loop:
    movu ym2, [srcq+ssq*0]
    vinserti32x8 m2, [srcq+ssq*1], 1
    lea srcq, [srcq+ssq*2]
    mova m0, m8
    vpermb m1, m4, m2
    vpdpwssd m0, m10, m1
    vpermb m1, m5, m2
    vpdpwssd m0, m11, m1
    vpermb m1, m6, m2
    vpdpwssd m0, m12, m1
    vpermb m1, m7, m2
    vpdpwssd m0, m13, m1
    psrad m0, 6
    vextracti32x8 ym1, m0, 1
    packusdw ym0, ym1
    pminsw ym0, ym15
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w8_loop
    RET
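; .h: horizontal dispatch; the rounding constant is selected from
; put_8tap_h_rnd by bitdepth_max (r8m >> 11 distinguishes 10- and 12-bit).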
.h:
    vpbroadcastw m15, r8m
    test myd, 0xf00
    jnz .hv
    mov r7d, r8m
    shr r7d, 11
    vpbroadcastd m8, [base+put_8tap_h_rnd+r7*4]
    cmp wd, 4
    jle .h_w4
    shr mxd, 16
    sub srcq, 6
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    mova [buf], xmm0
    vpbroadcastd m10, xmm0
    vpbroadcastd m11, [buf+ 4]
    vpbroadcastd m12, [buf+ 8]
    vpbroadcastd m13, [buf+12]
    sub wd, 16
    jl .h_w8
    vbroadcasti32x4 m6, [spel_h_shufA]
    vbroadcasti32x4 m7, [spel_h_shufB]
    jg .h_w32
.h_w16_loop:
    movu ym2, [srcq+ssq*0+ 0]
    vinserti32x8 m2, [srcq+ssq*1+ 0], 1
    movu ym3, [srcq+ssq*0+16]
    vinserti32x8 m3, [srcq+ssq*1+16], 1
    lea srcq, [srcq+ssq*2]
    mova m0, m8
    mova m1, m8
    pshufb m4, m2, m6
    vpdpwssd m0, m10, m4 ; a0
    pshufb m4, m3, m6
    vpdpwssd m1, m12, m4 ; b2
    pshufb m4, m2, m7
    vpdpwssd m0, m11, m4 ; a1
    pshufb m4, m3, m7
    vpdpwssd m1, m13, m4 ; b3
    shufpd m2, m3, 0x55
    pshufb m4, m2, m6
    vpdpwssd m0, m12, m4 ; a2
    vpdpwssd m1, m10, m4 ; b0
    pshufb m2, m7
    vpdpwssd m0, m13, m2 ; a3
    vpdpwssd m1, m11, m2 ; b1
    psrad m0, 6
    psrad m1, 6
    packusdw m0, m1
    pminsw m0, m15
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .h_w16_loop
    RET
.h_w32:
    lea srcq, [srcq+wq*2]
    lea dstq, [dstq+wq*2]
    neg wq
.h_w32_loop0:
    mov r6, wq
.h_w32_loop:
    movu m2, [srcq+r6*2+ 0]
    movu m3, [srcq+r6*2+ 8]
    mova m0, m8
    mova m1, m8
    pshufb m4, m2, m6
    vpdpwssd m0, m10, m4 ; a0
    pshufb m4, m3, m6
    vpdpwssd m1, m10, m4 ; b0
    vpdpwssd m0, m12, m4 ; a2
    movu m4, [srcq+r6*2+16]
    pshufb m3, m7
    vpdpwssd m1, m11, m3 ; b1
    vpdpwssd m0, m13, m3 ; a3
    pshufb m3, m4, m6
    vpdpwssd m1, m12, m3 ; b2
    pshufb m2, m7
    vpdpwssd m0, m11, m2 ; a1
    pshufb m4, m7
    vpdpwssd m1, m13, m4 ; b3
    psrad m0, 6
    psrad m1, 6
    packusdw m0, m1
    pminsw m0, m15
    mova [dstq+r6*2], m0
    add r6, 32
    jl .h_w32_loop
    add srcq, ssq
    add dstq, dsq
    dec hd
    jg .h_w32_loop0
    RET
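; H+V: horizontal filtering runs first at extended precision, then an 8-tap
; vertical pass; pd_2176/pd_640 provide the combined rounding for 10- and
; 12-bit sources and the result is normalized with a single psrad by 10.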
.hv:
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    pmovsxbw xmm1, [base+subpel_filters+myq*8]
    lea r6, [ssq*3]
    sub srcq, 2
    sub srcq, r6
    test dword r8m, 0x800
    jnz .hv_12bit
    vpbroadcastd m10, [pd_2176]
    psllw xmm0, 6
    jmp .hv_main
.hv_12bit:
    vpbroadcastd m10, [pd_640]
    psllw xmm0, 4
    psllw xmm1, 2
.hv_main:
    mova [buf+ 0], xmm0
    mova [buf+16], xmm1
    vpbroadcastd m8, [buf+ 4]
    vpbroadcastd m9, [buf+ 8]
    vpbroadcastd ym11, xmm1
    vpbroadcastd ym12, [buf+20]
    vpbroadcastd ym13, [buf+24]
    vpbroadcastd ym14, [buf+28]
    movu xm4, [srcq+ssq*0]
    vinserti32x4 ym4, [srcq+ssq*1], 1
    vinserti32x4 m4, [srcq+ssq*2], 2
    add srcq, r6
    vinserti32x4 m4, [srcq+ssq*0], 3 ; 0 1 2 3
    movu xm0, [srcq+ssq*1]
    vinserti32x4 ym0, [srcq+ssq*2], 1
    add srcq, r6
    vinserti32x4 m0, [srcq+ssq*0], 2 ; 4 5 6
    cmp wd, 4
    je .hv_w4
    vbroadcasti32x4 m2, [spel_h_shufA]
    mova m3, [spel_h_shuf2b]
    mova ym6, [spel_h_shuf2a]
    mova xm7, [spel_shuf2]
    mova m1, m10
    pshufb m4, m2
    pshufb m0, m2
    punpcklqdq m2, m4, m0
    vpdpwssd m1, m8, m2 ; 04 15 26 3_
    punpckhqdq m4, m0
    vpdpwssd m1, m9, m4
    vpermb m1, m3, m1 ; 01 12
    vextracti32x4 xm2, ym1, 1 ; 23 34
    vextracti32x4 xm3, m1, 2 ; 45 56
.hv_w2_loop:
    movu xm5, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x4 ym5, [srcq+ssq*0], 1
    mova xm4, xm10
    vpermb ym5, ym6, ym5
    pmaddwd xmm0, xm11, xm1 ; a0 b0
    vpdpwssd xm4, xm8, xm5
    vextracti32x4 xm5, ym5, 1
    mova xm1, xm2
    vpdpwssd xmm0, xm12, xm2 ; a1 b1
    vpdpwssd xm4, xm9, xm5 ; 7 8
    mova xm2, xm3
    vpdpwssd xmm0, xm13, xm3 ; a2 b2
    vpermt2b xm3, xm7, xm4 ; 67 78
    vpdpwssd xmm0, xm14, xm3 ; a3 b3
    psrad xmm0, 10
    packusdw xmm0, xmm0
    pminsw xmm0, xm15
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    vbroadcasti32x4 m19, [spel_h_shufA]
    vbroadcasti32x4 m20, [spel_h_shufB]
    mova ym6, [spel_shuf4a]
    mova ym7, [spel_shuf4b]
    mova m2, m10
    mova m3, m10
    pshufb m1, m4, m19
    vpdpwssd m2, m8, m1
    pshufb m1, m0, m19
    vpdpwssd m3, m8, m1
    pshufb m4, m20
    vpdpwssd m2, m9, m4
    pshufb m0, m20
    vpdpwssd m3, m9, m0
    vpermb m1, m6, m2 ; 01 12
    vshufi32x4 m2, m3, q1032
    vpermb m3, m6, m3 ; 45 56
    vpermb m2, m6, m2 ; 23 34
.hv_w4_loop:
    movu xm18, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti128 ym18, [srcq+ssq*0], 1
    pmaddwd ym16, ym11, ym1 ; a0 b0
    mova ym1, ym2
    mova ym2, ym3
    pshufb ym17, ym18, ym19
    mova ym3, ym10
    vpdpwssd ym3, ym8, ym17
    pshufb ym18, ym20
    vpdpwssd ym16, ym12, ym1 ; a1 b1
    vpdpwssd ym3, ym9, ym18 ; 7 8
    vpdpwssd ym16, ym13, ym2 ; a2 b2
    vpermt2b ym3, ym7, ym2 ; 67 78
    vpdpwssd ym16, ym14, ym3 ; a3 b3
    psrad ym16, 10
    vextracti128 xm17, ym16, 1
    packusdw xm16, xm17
    pminsw xm16, xm15
    movq [dstq+dsq*0], xm16
    movhps [dstq+dsq*1], xm16
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w4_loop
    vzeroupper
    RET
.hv_w8:
    shr mxd, 16
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    pmovsxbw xmm1, [base+subpel_filters+myq*8]
    lea r6, [ssq*3]
    sub srcq, 6
    sub srcq, r6
    test dword r8m, 0x800
    jnz .hv_w8_12bit
    vpbroadcastd m10, [pd_2176]
    psllw xmm0, 6
    jmp .hv_w8_main
.hv_w8_12bit:
    vpbroadcastd m10, [pd_640]
    psllw xmm0, 4
    psllw xmm1, 2
.hv_w8_main:
    mova [buf+ 0], xmm0
    mova [buf+16], xmm1
    vpbroadcastd m11, xmm0
    vpbroadcastd m12, [buf+ 4]
    vpbroadcastd m13, [buf+ 8]
    vpbroadcastd m14, [buf+12]
    vpbroadcastd m16, xmm1
    vpbroadcastd m17, [buf+20]
    vpbroadcastd m18, [buf+24]
    vpbroadcastd m19, [buf+28]
    cmp wd, 8
    jg .hv_w16
    mova m5, [spel_h_shufA]
    movu ym0, [srcq+ssq*0]
    vinserti32x8 m0, [srcq+ssq*1], 1 ; 0 1
    movu ym9, [srcq+ssq*2]
    add srcq, r6
    vinserti32x8 m9, [srcq+ssq*0], 1 ; 2 3
    movu ym20, [srcq+ssq*1]
    vinserti32x8 m20, [srcq+ssq*2], 1 ; 4 5
    add srcq, r6
    movu ym21, [srcq+ssq*0] ; 6
    movu m6, [spel_h_shufB]
    movu m7, [spel_h_shufC]
    vpermb m8, m5, m0
    mova m1, m10
    vpdpwssd m1, m11, m8 ; a0 b0
    vpermb m8, m5, m9
    mova m2, m10
    vpdpwssd m2, m11, m8 ; c0 d0
    vpermb m8, m5, m20
    mova m3, m10
    vpdpwssd m3, m11, m8 ; e0 f0
    vpermb m8, m5, m21
    mova m4, m10
    vpdpwssd m4, m11, m8 ; g0
    vpermb m8, m6, m0
    vpdpwssd m1, m12, m8 ; a1 b1
    vpermb m8, m6, m9
    vpdpwssd m2, m12, m8 ; c1 d1
    vpermb m8, m6, m20
    vpdpwssd m3, m12, m8 ; e1 f1
    vpermb m8, m6, m21
    vpdpwssd m4, m12, m8 ; g1
    vpermb m8, m7, m0
    vpdpwssd m1, m13, m8 ; a2 b2
    vpermb m8, m7, m9
    vpdpwssd m2, m13, m8 ; c2 d2
    vpermb m8, m7, m20
    vpdpwssd m3, m13, m8 ; e2 f2
    vpermb m8, m7, m21
    vpdpwssd m4, m13, m8 ; g2
    mova m8, [spel_h_shufD]
    vpermb m0, m8, m0
    vpdpwssd m1, m14, m0 ; a3 b3
    mova m0, [spel_shuf8a]
    vpermb m9, m8, m9
    vpdpwssd m2, m14, m9 ; c3 d3
    mova m9, [spel_shuf8b]
    vpermb m20, m8, m20
    vpdpwssd m3, m14, m20 ; e3 f3
    vpermb m21, m8, m21
    vpdpwssd m4, m14, m21 ; g3
    vpermt2b m1, m0, m2 ; 01 12
    vpermt2b m2, m0, m3 ; 23 34
    vpermt2b m3, m0, m4 ; 45 56
.hv_w8_loop:
    movu ym0, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m0, [srcq+ssq*0], 1
    mova m4, m10
    vpermb m21, m5, m0
    vpdpwssd m4, m11, m21 ; h0 i0
    vpermb m21, m6, m0
    pmaddwd m20, m16, m1 ; A0 B0
    vpdpwssd m4, m12, m21 ; h1 i1
    vpermb m21, m7, m0
    mova m1, m2
    vpdpwssd m20, m17, m2 ; A1 B1
    vpdpwssd m4, m13, m21 ; h2 i2
    vpermb m21, m8, m0
    mova m2, m3
    vpdpwssd m20, m18, m3 ; A2 B2
    vpdpwssd m4, m14, m21 ; h3 i3
    vpermt2b m3, m9, m4 ; 67 78
    vpdpwssd m20, m19, m3 ; A3 B3
    psrad m20, 10
    vextracti32x8 ym21, m20, 1
    packusdw ym20, ym21
    pminsw ym20, ym15
    mova [dstq+dsq*0], xm20
    vextracti128 [dstq+dsq*1], ym20, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .hv_w8_loop
    vzeroupper
    RET
.hv_w16:
    WIN64_SPILL_XMM 26
    vbroadcasti32x4 m20, [spel_h_shufA]
    vbroadcasti32x4 m21, [spel_h_shufB]
    add wd, wd
    mova m9, [spel_shuf16]
    lea wd, [hq+wq*8-256]
.hv_w16_loop0:
    vbroadcasti32x8 m5, [srcq+ssq*0+ 8]
    vinserti32x8 m4, m5, [srcq+ssq*0+ 0], 0
    vinserti32x8 m5, [srcq+ssq*0+16], 1 ; 0
    movu ym6, [srcq+ssq*1+ 0]
    movu ym7, [srcq+ssq*1+16]
    lea r7, [srcq+r6]
    vinserti32x8 m6, [srcq+ssq*2+ 0], 1
    vinserti32x8 m7, [srcq+ssq*2+16], 1 ; 1 2
    movu ym22, [r7 +ssq*0+ 0]
    movu ym23, [r7 +ssq*0+16]
    mov r8, dstq
    vinserti32x8 m22, [r7 +ssq*1+ 0], 1
    vinserti32x8 m23, [r7 +ssq*1+16], 1 ; 3 4
    movu ym24, [r7 +ssq*2+ 0]
    movu ym25, [r7 +ssq*2+16]
    add r7, r6
    vinserti32x8 m24, [r7 +ssq*0+ 0], 1
    vinserti32x8 m25, [r7 +ssq*0+16], 1 ; 5 6
    pshufb m0, m4, m20
    mova m1, m10
    vpdpwssd m1, m11, m0 ; a0
    pshufb m0, m6, m20
    mova m2, m10
    vpdpwssd m2, m11, m0 ; b0
    pshufb m0, m7, m20
    mova m3, m10
    vpdpwssd m3, m13, m0 ; c2
    pshufb m0, m4, m21
    vpdpwssd m1, m12, m0 ; a1
    pshufb m0, m6, m21
    vpdpwssd m2, m12, m0 ; b1
    pshufb m0, m7, m21
    vpdpwssd m3, m14, m0 ; c3
    pshufb m0, m5, m20
    vpdpwssd m1, m13, m0 ; a2
    shufpd m6, m7, 0x55
    pshufb m7, m6, m20
    vpdpwssd m2, m13, m7 ; b2
    vpdpwssd m3, m11, m7 ; c0
    pshufb m5, m21
    vpdpwssd m1, m14, m5 ; a3
    pshufb m6, m21
    vpdpwssd m2, m14, m6 ; b3
    vpdpwssd m3, m12, m6 ; c1
    pshufb m0, m22, m20
    mova m4, m10
    vpdpwssd m4, m11, m0 ; d0
    pshufb m0, m23, m20
    mova m5, m10
    vpdpwssd m5, m13, m0 ; e2
    pshufb m0, m24, m20
    mova m6, m10
    vpdpwssd m6, m11, m0 ; f0
    pshufb m0, m25, m20
    mova m7, m10
    vpdpwssd m7, m13, m0 ; g2
    pshufb m0, m22, m21
    vpdpwssd m4, m12, m0 ; d1
    pshufb m0, m23, m21
    vpdpwssd m5, m14, m0 ; e3
    pshufb m0, m24, m21
    vpdpwssd m6, m12, m0 ; f1
    pshufb m0, m25, m21
    vpdpwssd m7, m14, m0 ; g3
    shufpd m22, m23, 0x55
    pshufb m23, m22, m20
    vpdpwssd m4, m13, m23 ; d2
    vpdpwssd m5, m11, m23 ; e0
    shufpd m24, m25, 0x55
    pshufb m25, m24, m20
    vpdpwssd m6, m13, m25 ; f2
    vpdpwssd m7, m11, m25 ; g0
    pshufb m22, m21
    vpdpwssd m4, m14, m22 ; d3
    vpdpwssd m5, m12, m22 ; e1
    pshufb m24, m21
    vpdpwssd m6, m14, m24 ; f3
    vpdpwssd m7, m12, m24 ; g1
    pslldq m1, 1
    vpermt2b m2, m9, m3 ; 12
    vpermt2b m4, m9, m5 ; 34
    vpermt2b m6, m9, m7 ; 56
    vpshrdd m1, m2, 16 ; 01
    vpshrdd m3, m2, m4, 16 ; 23
    vpshrdd m5, m4, m6, 16 ; 45
.hv_w16_loop:
    movu ym24, [r7+ssq*1+ 0]
    movu ym25, [r7+ssq*1+16]
    lea r7, [r7+ssq*2]
    vinserti32x8 m24, [r7+ssq*0+ 0], 1
    vinserti32x8 m25, [r7+ssq*0+16], 1
    mova m7, m10
    mova m8, m10
    pshufb m0, m24, m20
    vpdpwssd m7, m11, m0 ; h0
    pshufb m0, m25, m20
    vpdpwssd m8, m13, m0 ; i2
    pmaddwd m22, m16, m1 ; A0
    mova m1, m3
    pmaddwd m23, m16, m2 ; B0
    mova m2, m4
    pshufb m0, m24, m21
    vpdpwssd m7, m12, m0 ; h1
    pshufb m0, m25, m21
    vpdpwssd m8, m14, m0 ; i3
    vpdpwssd m22, m17, m3 ; A1
    mova m3, m5
    vpdpwssd m23, m17, m4 ; B1
    mova m4, m6
    shufpd m24, m25, 0x55
    pshufb m25, m24, m20
    vpdpwssd m7, m13, m25 ; h2
    vpdpwssd m8, m11, m25 ; i0
    vpdpwssd m22, m18, m5 ; A2
    vpdpwssd m23, m18, m6 ; B2
    pshufb m24, m21
    vpdpwssd m7, m14, m24 ; h3
    vpdpwssd m8, m12, m24 ; i1
    vpermt2b m7, m9, m8 ; 78
    vpshrdd m5, m6, m7, 16 ; 67
    vpdpwssd m22, m19, m5 ; A3
    vpdpwssd m23, m19, m7 ; B3
    mova m6, m7
    psrad m22, 10
    psrad m23, 10
    vshufi32x4 m0, m22, m23, q3232
    vinserti32x8 m22, ym23, 1
    packusdw m22, m0
    pminsw m22, m15
    mova [r8+dsq*0], ym22
    vextracti32x8 [r8+dsq*1], m22, 1
    lea r8, [r8+dsq*2]
    sub hd, 2
    jg .hv_w16_loop
    add srcq, 32
    add dstq, 32
    movzx hd, wb
    sub wd, 1<<8
    jg .hv_w16_loop0
    RET

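; prep (intermediate prediction) entry points follow; t0/t1 pick up the
; filter-type arguments from different registers depending on the ABI.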
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth, SMOOTH, SMOOTH, prep_6tap_16bpc
PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR, prep_6tap_16bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH, prep_6tap_16bpc
PREP_8TAP_FN regular, REGULAR, REGULAR

cglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my
%define base r7-prep_avx512icl
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 6tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 6tap_v, my, 4tap_v
    lea r7, [prep_avx512icl]
    mov wd, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jnz .v
.prep:
    tzcnt wd, wd
    mov r5d, r7m ; bitdepth_max
    vpbroadcastd m5, [pw_8192]
    movzx wd, word [r7+wq*2+table_offset(prep,)]
    shr r5d, 11
    vpbroadcastd m4, [r7-prep_avx512icl+prep_mul+r5*4]
    add wq, r7
    lea r6, [ssq*3]
%if WIN64
    pop r7
%endif
    jmp wq
.h_w8:
    mova m6, [spel_h_shufA]
    movu m7, [spel_h_shufC]
    mova m8, [prep_endB]
.h_w8_loop:
    movu ym4, [srcq+ssq*0]
    vinserti32x8 m4, [srcq+ssq*1], 1
    movu ym5, [srcq+ssq*2]
    vinserti32x8 m5, [srcq+r6 ], 1
    lea srcq, [srcq+ssq*4]
    mova m0, m10
    mova m1, m10
    vpermb m2, m6, m4
    vpermb m3, m6, m5
    vpdpwssd m0, m12, m2 ; a0 b0
    vpdpwssd m1, m12, m3 ; c0 d0
    vpermb m4, m7, m4
    vpermb m5, m7, m5
    vpdpwssd m0, m14, m4 ; a2 b2
    vpdpwssd m1, m14, m5 ; c2 d2
    shufpd m2, m4, 0x55
    shufpd m3, m5, 0x55
    vpdpwssd m0, m13, m2 ; a1 b1
    vpdpwssd m1, m13, m3 ; c1 d1
    vpermt2b m0, m8, m1
    mova [tmpq], m0
    add tmpq, 64
    sub hd, 4
    jg .h_w8_loop
    RET
.h:
    vpbroadcastd m10, [prep_8tap_rnd]
    test myd, 0xf00
    jnz .hv
    lea r6, [ssq*3]
    cmp wd, 4
    je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4
    shr mxd, 16
    pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
    mov r5d, r7m
    sub srcq, 4
    shr r5d, 11
    psllw xmm0, [base+prep_hv_shift+r5*8]
    mova [tmpq], xmm0
    vpbroadcastd m12, xmm0
    vpbroadcastd m13, [tmpq+ 4]
    vpbroadcastd m14, [tmpq+ 8]
    cmp wd, 16
    jl .h_w8
    vbroadcasti32x4 m5, [spel_h_shufA]
    vbroadcasti32x4 m6, [spel_h_shufB]
    mova m7, [prep_endC]
    jg .h_w32
.h_w16_loop:
    movu ym2, [srcq+ssq*0+ 0]
    vinserti32x8 m2, [srcq+ssq*1+ 0], 1
    movu ym3, [srcq+ssq*0+12]
    vinserti32x8 m3, [srcq+ssq*1+12], 1
    lea srcq, [srcq+ssq*2]
    mova m0, m10
    mova m1, m10
    pshufb m4, m2, m5 ; 01
    vpdpwssd m0, m12, m4 ; a0 b0
    pshufb m4, m3, m6 ; 89
    vpdpwssd m1, m14, m4 ; a2' b2'
    pshufb m2, m6 ; 23
    pshufb m3, m5 ; 67
    vpdpwssd m0, m13, m2 ; a1 b1
    vpdpwssd m1, m13, m3 ; a1' b1'
    shufpd m2, m3, 0x55 ; 45
    vpdpwssd m0, m14, m2 ; a2 b2
    vpdpwssd m1, m12, m2 ; a0' b0'
    vpermt2b m0, m7, m1
    mova [tmpq], m0
    add tmpq, 64
    sub hd, 2
    jg .h_w16_loop
    RET
.h_w32:
    lea srcq, [srcq+wq*2]
    neg wq
.h_w32_loop0:
    mov r6, wq
.h_w32_loop:
    movu m2, [srcq+r6*2+ 0]
    movu m3, [srcq+r6*2+12]
    mova m0, m10
    mova m1, m10
    pshufb m4, m2, m5
    vpdpwssd m0, m12, m4
    pshufb m4, m3, m6
    vpdpwssd m1, m14, m4
    pshufb m2, m6
    pshufb m3, m5
    vpdpwssd m0, m13, m2
    vpdpwssd m1, m13, m3
    shufpd m2, m3, 0x55
    vpdpwssd m0, m14, m2
    vpdpwssd m1, m12, m2
    vpermt2b m0, m7, m1
    mova [tmpq], m0
    add tmpq, 64
    add r6, 32
    jl .h_w32_loop
    add srcq, ssq
    dec hd
    jg .h_w32_loop0
    RET
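; Vertical-only 6-tap prep: r6 is set to -ssq so that [srcq+r6*1] and
; [srcq+r6*2] address the two rows above the current source position.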
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    mov r5d, r7m
    vpbroadcastd m10, [prep_8tap_rnd]
    pmovsxbw xmm0, [base+subpel_filters+1+myq*8]
    tzcnt r6d, wd
    shr r5d, 11
    movzx r6d, word [r7+r6*2+table_offset(prep, _6tap_v)]
    psllw xmm0, [base+prep_hv_shift+r5*8]
    add r7, r6
    mova [tmpq], xmm0
    vpbroadcastd m12, xmm0
    mov r6, ssq
    vpbroadcastd m13, [tmpq+ 4]
    neg r6
    vpbroadcastd m14, [tmpq+ 8]
    jmp r7
.v_w4:
    mov r3d, 0x330c
    movq xm1, [srcq+r6 *2]
    kmovw k1, r3d
    vpbroadcastq ym1{k1}, [srcq+r6 *1]
    vpbroadcastq m2, [srcq+ssq*0]
    vinserti32x4 m1{k1}, m2, [srcq+ssq*1], 3
    movq xm0, [srcq+ssq*2]
    mova ym4, [prep_endA]
    valignq m0, m1, 2
    punpcklwd m1, m0 ; 01 12 23 34
.v_w4_loop:
    lea srcq, [srcq+ssq*4]
    movq xm2, [srcq+r6 *1]
    vpbroadcastq ym2{k1}, [srcq+ssq*0]
    vpbroadcastq m3, [srcq+ssq*1]
    vinserti32x4 m2{k1}, m3, [srcq+ssq*2], 3
    mova m3, m10
    vpdpwssd m3, m12, m1 ; a0 b0 c0 d0
    valignq m0, m2, m0, 6 ; 4 5 6 7
    punpcklwd m0, m2 ; 45 56 67 78
    vpdpwssd m3, m14, m0 ; a2 b2 c2 d2
    vshufi32x4 m1, m0, q1032 ; 23 34 45 56
    vpdpwssd m3, m13, m1 ; a1 b1 c1 d1
    mova m1, m0
    mova m0, m2
    vpermb m3, m4, m3
    mova [tmpq], ym3
    add tmpq, 32
    sub hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    vbroadcasti32x4 ym1, [srcq+r6 *1]
    mov r3d, 0x33
    vbroadcasti32x4 m2, [srcq+ssq*0]
    kmovb k1, r3d
    mova m6, [spel_v_shuf8]
    vinserti64x2 m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2
    vbroadcasti32x4 ym0, [srcq+ssq*1]
    vinserti64x2 m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4
    mova m7, [prep_endB]
    vpermb m1, m6, m1 ; 01 12
    vpermb m2, m6, m0 ; 23 34
.v_w8_loop:
    lea srcq, [srcq+ssq*4]
    vbroadcasti32x4 ym3, [srcq+r6 *1]
    movu xm4, [srcq+ssq*0]
    vshufi64x2 m3{k1}, m0, m4, q1032 ; 4 5 6
    vbroadcasti32x4 ym0, [srcq+ssq*1]
    vinserti64x2 m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8
    mova m4, m10
    vpdpwssd m4, m12, m1 ; a0 b0
    mova m5, m10
    vpdpwssd m5, m12, m2 ; c0 d0
    vpermb m1, m6, m3 ; 45 56
    vpdpwssd m4, m13, m2 ; a1 b1
    vpermb m2, m6, m0 ; 67 78
    vpdpwssd m5, m13, m1 ; c1 d1
    vpdpwssd m4, m14, m1 ; a2 b2
    vpdpwssd m5, m14, m2 ; c2 d2
    vpermt2b m4, m7, m5
    mova [tmpq], m4
    add tmpq, 64
    sub hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    vbroadcasti32x8 m0, [srcq+r6 *1]
    vinserti32x8 m1, m0, [srcq+ssq*0], 1 ; 1 2
    vinserti32x8 m0, [srcq+r6 *2], 0 ; 0 1
    mova m6, [spel_v_shuf16]
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m3, [srcq+ssq*0], 1 ; 3 4
    mova m7, [prep_endA]
    vpermb m1, m6, m1 ; 12
    vpermb m0, m6, m0 ; 01
    vpermb m3, m6, m3 ; 34
    vpshrdd m2, m1, m3, 16 ; 23
.v_w16_loop:
    mova m5, m10
    vpdpwssd m5, m12, m1 ; b0
    mova m4, m10
    vpdpwssd m4, m12, m0 ; a0
    mova m1, m3
    vpdpwssd m5, m13, m3 ; b1
    movu ym3, [srcq+ssq*1]
    lea srcq, [srcq+ssq*2]
    vpdpwssd m4, m13, m2 ; a1
    vinserti32x8 m3, [srcq+ssq*0], 1
    mova m0, m2
    vpermb m3, m6, m3 ; 56
    vpshrdd m2, m1, m3, 16 ; 45
    vpdpwssd m5, m14, m3 ; b2
    vpdpwssd m4, m14, m2 ; a2
    vpermt2b m4, m7, m5
    mova [tmpq], m4
    add tmpq, 64
    sub hd, 2
    jg .v_w16_loop
    RET
.v_w32:
.v_w64:
.v_w128:
%if WIN64
    push r8
%endif
    mova m11, [prep_endC]
    lea r5, [hq+wq*8-256]
.v_w32_loop0:
    movu m4, [srcq+r6 *2]
    movu m5, [srcq+r6 *1]
    lea r7, [srcq+ssq*2]
    movu m6, [srcq+ssq*0]
    movu m7, [srcq+ssq*1]
    mov r8, tmpq
    movu m8, [r7 +ssq*0]
    punpcklwd m0, m4, m5 ; 01
    punpckhwd m4, m5
    punpcklwd m1, m5, m6 ; 12
    punpckhwd m5, m6
    punpcklwd m2, m6, m7 ; 23
    punpckhwd m6, m7
    punpcklwd m3, m7, m8 ; 34
    punpckhwd m7, m8
.v_w32_loop:
    mova m16, m10
    movu m9, [r7+ssq*1]
    mova m18, m10
    vpdpwssd m16, m12, m0 ; a0
    mova m17, m10
    vpdpwssd m18, m12, m4
    mova m19, m10
    vpdpwssd m17, m12, m1 ; b0
    lea r7, [r7+ssq*2]
    vpdpwssd m19, m12, m5
    mova m0, m2
    vpdpwssd m16, m13, m2 ; a1
    punpcklwd m2, m8, m9 ; 45
    mova m4, m6
    vpdpwssd m18, m13, m6
    punpckhwd m6, m8, m9
    movu m8, [r7+ssq*0]
    vpdpwssd m17, m13, m3 ; b1
    mova m1, m3
    vpdpwssd m19, m13, m7
    mova m5, m7
    vpdpwssd m16, m14, m2 ; a2
    punpcklwd m3, m9, m8 ; 56
    vpdpwssd m18, m14, m6
    punpckhwd m7, m9, m8
    vpdpwssd m17, m14, m3 ; b2
    vpdpwssd m19, m14, m7
    vpermt2b m16, m11, m18
    vpermt2b m17, m11, m19
    mova [r8+wq*0], m16
    mova [r8+wq*2], m17
    lea r8, [r8+wq*4]
    sub hd, 2
    jg .v_w32_loop
    add srcq, 64
    add tmpq, 64
    movzx hd, r5b
    sub r5d, 1<<8
    jg .v_w32_loop0
%if WIN64
    pop r8
%endif
    vzeroupper
    RET
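; H+V prep paths; dispatched from .hv below (the file keeps the width-4 and
; width-8 bodies ahead of their dispatcher).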
.hv_w4:
    movzx mxd, mxb
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    mov r5d, r7m
    pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 2
    shr r5d, 11
    neg r6
    psllw xmm0, [base+prep_hv_shift+r5*8]
    psllw xmm1, 2
    mova [tmpq+ 0], xmm0
    mova [tmpq+16], xmm1
    vpbroadcastd m8, [tmpq+ 4]
    mov r3d, 0xf0
    vpbroadcastd m9, [tmpq+ 8]
    vpbroadcastd m12, xmm1
    movu xm3, [srcq+r6 *2]
    kmovb k1, r3d
    vinserti32x4 ym3, [srcq+r6 *1], 1
    vbroadcasti32x4 m2, [srcq+ssq*0]
    vinserti64x2 m3{k1}, m2, [srcq+ssq*1], 3
    movu xm4, [srcq+ssq*2]
    vbroadcasti32x4 m5, [spel_h_shufA]
    vbroadcasti32x4 m6, [spel_h_shufB]
    mova m1, m11
    mova m15, [spel_shuf4a]
    mova xm2, xm11
    pshufb m0, m3, m5
    vpdpwssd m1, m8, m0
    pshufb xm0, xm4, xm5
    vpdpwssd xm2, xm8, xm0
    vpbroadcastd m13, [tmpq+20]
    pshufb m3, m6
    vpbroadcastd m14, [tmpq+24]
    pshufb xm4, xm6
    mova m7, [spel_shuf4b]
    vpdpwssd m1, m9, m3 ; 0 1 2 3
    vpdpwssd xm2, xm9, xm4 ; 4
    vpermt2b m1, m15, m2 ; 01 12 23 34
    mova ym15, [prep_endA]
.hv_w4_loop:
    lea srcq, [srcq+ssq*4]
    movu xm4, [srcq+r6 *1]
    vinserti32x4 ym4, [srcq+ssq*0], 1
    vbroadcasti32x4 m3, [srcq+ssq*1]
    vinserti64x2 m4{k1}, m3, [srcq+ssq*2], 3
    mova m2, m11
    pshufb m3, m4, m5
    vpdpwssd m2, m8, m3
    mova m3, m10
    vpdpwssd m3, m12, m1 ; a0 b0 c0 d0
    pshufb m4, m6
    vpdpwssd m2, m9, m4 ; 5 6 7 8
    mova m4, m1
    vpermt2b m1, m7, m2 ; 45 56 67 78
    vpdpwssd m3, m14, m1 ; a2 b2 c2 d2
    vshufi32x4 m4, m1, q1032 ; 23 34 45 56
    vpdpwssd m3, m13, m4 ; a1 b1 c1 d1
    vpermb m3, m15, m3
    mova [tmpq], ym3
    add tmpq, 32
    sub hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    mova m8, [spel_h_shufA]
    movu ym18, [srcq+r6 *2]
    vinserti32x8 m18, [srcq+r6 *1], 1 ; 0 1
    movu ym19, [srcq+ssq*0]
    vinserti32x8 m19, [srcq+ssq*1], 1 ; 2 3
    movu ym20, [srcq+ssq*2] ; 4
    movu m9, [spel_h_shufC]
    mova m21, [spel_shuf8a]
    mova m0, [spel_shuf8b]
    vpermb m4, m8, m18
    mova m1, m10
    vpermb m5, m8, m19
    vpdpwssd m1, m12, m4 ; a0 b0
    mova m2, m10
    vpermb m6, m8, m20
    vpdpwssd m2, m12, m5 ; c0 d0
    mova m3, m10
    vpermb m18, m9, m18
    vpdpwssd m3, m12, m6 ; e0
    mova m7, [prep_endB]
    vpermb m19, m9, m19
    vpdpwssd m1, m14, m18 ; a2 b2
    vpermb m20, m9, m20
    vpdpwssd m2, m14, m19 ; c2 d2
    shufpd m4, m18, 0x55
    vpdpwssd m3, m14, m20 ; e2
    shufpd m5, m19, 0x55
    vpdpwssd m1, m13, m4 ; a1 b1
    shufpd m6, m20, 0x55
    vpdpwssd m2, m13, m5 ; c1 d1
    vpdpwssd m3, m13, m6 ; e1
    vpermt2b m1, m21, m2 ; 01 12
    vpermt2b m2, m21, m3 ; 23 34
.hv_w8_loop:
    lea srcq, [srcq+ssq*4]
    movu ym18, [srcq+r6 *1]
    vinserti32x8 m18, [srcq+ssq*0], 1
    movu ym19, [srcq+ssq*1]
    vinserti32x8 m19, [srcq+ssq*2], 1
    mova m3, m10
    vpermb m5, m8, m18
    mova m4, m10
    vpermb m6, m8, m19
    vpdpwssd m3, m12, m5 ; f0 g0
    mova m20, m11
    vpdpwssd m4, m12, m6 ; h0 i0
    mova m21, m11
    vpdpwssd m20, m15, m1 ; A0 B0
    vpermb m18, m9, m18
    vpdpwssd m21, m15, m2 ; C0 D0
    vpermb m19, m9, m19
    vpdpwssd m3, m14, m18 ; f2 g2
    vpdpwssd m4, m14, m19 ; h2 i2
    shufpd m5, m18, 0x55
    vpdpwssd m20, m16, m2 ; A1 B1
    shufpd m6, m19, 0x55
    vpdpwssd m3, m13, m5 ; f1 g1
    vpdpwssd m4, m13, m6 ; h1 i1
    vpermt2b m2, m0, m3 ; 45 56
    vpdpwssd m21, m16, m2 ; C1 D1
    mova m1, m2
    vpermt2b m2, m0, m4 ; 67 78
    vpdpwssd m20, m17, m1 ; A2 B2
    vpdpwssd m21, m17, m2 ; C2 D2
    vpermt2b m20, m7, m21
    mova [tmpq], m20
    add tmpq, 64
    sub hd, 4
    jg .hv_w8_loop
    vzeroupper
    RET
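; .hv dispatch: pd_128 is loaded as the second rounding constant here;
; together with prep_8tap_rnd (set up in .h) it rounds the two filter passes.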
.hv:
    vpbroadcastd m11, [pd_128]
    cmp wd, 4
    je .hv_w4
    shr mxd, 16
    pmovsxbw xmm0, [base+subpel_filters+1+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    mov r5d, r7m
    pmovsxbw xmm1, [base+subpel_filters+1+myq*8]
    mov r6, ssq
    sub srcq, 4
    shr r5d, 11
    neg r6
    psllw xmm0, [base+prep_hv_shift+r5*8]
    psllw xmm1, 2
    mova [tmpq+ 0], xmm0
    mova [tmpq+16], xmm1
    vpbroadcastd m12, xmm0
    vpbroadcastd m13, [tmpq+ 4]
    vpbroadcastd m14, [tmpq+ 8]
    vpbroadcastd m15, xmm1
    vpbroadcastd m16, [tmpq+20]
    vpbroadcastd m17, [tmpq+24]
    cmp wd, 16
    jl .hv_w8
    vbroadcasti32x4 m8, [spel_h_shufA]
    vbroadcasti32x4 m9, [spel_h_shufB]
    jg .hv_w32
    vbroadcasti32x8 m6, [srcq+r6 *2+ 8]
    vinserti32x8 m2, m6, [srcq+r6 *2+16], 1
    vinserti32x8 m6, [srcq+r6 *2+ 0], 0 ; 0
    movu ym18, [srcq+r6 *1+ 0]
    movu ym19, [srcq+r6 *1+12]
    vinserti32x8 m18, [srcq+ssq*0+ 0], 1
    vinserti32x8 m19, [srcq+ssq*0+12], 1 ; 1 2
    movu ym20, [srcq+ssq*1+ 0]
    movu ym21, [srcq+ssq*1+12]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m20, [srcq+ssq*0+ 0], 1
    vinserti32x8 m21, [srcq+ssq*0+12], 1 ; 3 4
    pshufb m2, m8
    mova m1, m10
    pshufb m3, m18, m8
    vpdpwssd m1, m14, m2 ; a2
    mova m2, m10
    pshufb m4, m19, m9
    vpdpwssd m2, m12, m3 ; b0 c0
    mova m3, m10
    pshufb m5, m20, m8
    vpdpwssd m3, m14, m4 ; b2' c2'
    mova m4, m10
    pshufb m7, m21, m9
    vpdpwssd m4, m12, m5 ; d0 e0
    mova m5, m10
    pshufb m0, m6, m8
    vpdpwssd m5, m14, m7 ; d2' e2'
    mova m7, [spel_shuf16]
    pshufb m18, m9
    vpdpwssd m1, m12, m0 ; a0
    pshufb m19, m8
    vpdpwssd m2, m13, m18 ; b1 c1
    pshufb m20, m9
    vpdpwssd m3, m13, m19 ; b1' c1'
    pshufb m21, m8
    vpdpwssd m4, m13, m20 ; d1 e1
    pshufb m6, m9
    vpdpwssd m5, m13, m21 ; d1' e1'
    mova m0, [prep_endB]
    shufpd m18, m19, 0x55
    vpdpwssd m1, m13, m6 ; a1
    shufpd m20, m21, 0x55
    vpdpwssd m2, m14, m18 ; b2 c2
    vpdpwssd m3, m12, m18 ; b0' c0'
    vpdpwssd m4, m14, m20 ; d2 e2
    vpdpwssd m5, m12, m20 ; d0' e0'
    pslldq m1, 1
    vpermt2b m2, m7, m3 ; 12
    vpermt2b m4, m7, m5 ; 34
    vpshrdd m1, m2, 16 ; 01
    vpshrdd m3, m2, m4, 16 ; 23
.hv_w16_loop:
    movu ym18, [srcq+ssq*1+ 0]
    movu ym19, [srcq+ssq*1+12]
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m18, [srcq+ssq*0+ 0], 1
    vinserti32x8 m19, [srcq+ssq*0+12], 1
    mova m5, m10
    mova m6, m10
    pshufb m21, m18, m8
    vpdpwssd m5, m12, m21 ; f0 g0
    pshufb m20, m19, m9
    mova m21, m11
    vpdpwssd m6, m14, m20 ; f2' g2'
    mova m20, m11
    vpdpwssd m21, m15, m2 ; B0
    mova m2, m4
    vpdpwssd m20, m15, m1 ; A0
    mova m1, m3
    pshufb m18, m9
    vpdpwssd m5, m13, m18 ; f1 g1
    pshufb m19, m8
    vpdpwssd m6, m13, m19 ; f1' g1'
    vpdpwssd m21, m16, m4 ; B1
    vpdpwssd m20, m16, m3 ; A1
    shufpd m18, m19, 0x55
    vpdpwssd m5, m14, m18 ; f2 g2
    vpdpwssd m6, m12, m18 ; f0' g0'
    mova m4, m7
    vpermi2b m4, m5, m6 ; 56
    vpshrdd m3, m2, m4, 16 ; 45
    vpdpwssd m21, m17, m4 ; B2
    vpdpwssd m20, m17, m3 ; A2
    vpermt2b m20, m0, m21
    mova [tmpq], m20
    add tmpq, 64
    sub hd, 2
    jg .hv_w16_loop
    vzeroupper
    RET
.hv_w32:
    WIN64_SPILL_XMM 29
%if WIN64
    push r8
%endif
    mova m27, [spel_shuf32]
    lea r5d, [hq+wq*8-256]
    mova m28, [prep_endC]
.hv_w32_loop0:
    movu m18, [srcq+r6 *2+ 0]
    movu m7, [srcq+r6 *2+12]
    movu m6, [srcq+r6 *1+ 0]
    movu m20, [srcq+r6 *1+12]
    lea r7, [srcq+ssq*2]
    movu m19, [srcq+ssq*0+ 0]
    movu m21, [srcq+ssq*0+12]
    movu m22, [srcq+ssq*1+ 0]
    movu m24, [srcq+ssq*1+12]
    mov r8, tmpq
    movu m23, [r7 +ssq*0+ 0]
    movu m25, [r7 +ssq*0+12]
    pshufb m1, m18, m8
    mova m0, m10
    pshufb m2, m7, m9
    vpdpwssd m0, m12, m1 ; a0
    mova m1, m10
    pshufb m4, m6, m8
    vpdpwssd m1, m14, m2 ; a2'
    mova m2, m10
    pshufb m3, m19, m8
    vpdpwssd m2, m12, m4 ; b0
    mova m4, m10
    pshufb m5, m20, m9
    vpdpwssd m4, m12, m3 ; c0
    mova m3, m10
    pshufb m26, m21, m9
    vpdpwssd m3, m14, m5 ; b2'
    mova m5, m10
    pshufb m18, m9
    vpdpwssd m5, m14, m26 ; c2'
    pshufb m7, m8
    vpdpwssd m0, m13, m18 ; a1
    pshufb m6, m9
    vpdpwssd m1, m13, m7 ; a1'
    pshufb m19, m9
    vpdpwssd m2, m13, m6 ; b1
    pshufb m20, m8
    vpdpwssd m4, m13, m19 ; c1
    pshufb m21, m8
    vpdpwssd m3, m13, m20 ; b1'
    shufpd m18, m7, 0x55
    vpdpwssd m5, m13, m21 ; c1'
    shufpd m6, m20, 0x55
    vpdpwssd m0, m14, m18 ; a2
    shufpd m19, m21, 0x55
    vpdpwssd m1, m12, m18 ; a0'
    pshufb m18, m22, m8
    vpdpwssd m2, m14, m6 ; b2
    pshufb m7, m23, m8
    vpdpwssd m4, m14, m19 ; c2
    vpdpwssd m3, m12, m6 ; b0'
    mova m6, m10
    vpdpwssd m5, m12, m19 ; c0'
    pshufb m19, m24, m9
    vpdpwssd m6, m12, m18 ; d0
    mova m18, m10
    pshufb m26, m25, m9
    vpdpwssd m18, m12, m7 ; e0
    mova m7, m10
    pshufb m22, m9
    vpdpwssd m7, m14, m19 ; d2'
    mova m19, m10
    pshufb m23, m9
    vpdpwssd m19, m14, m26 ; e2'
    pshufb m24, m8
    vpdpwssd m6, m13, m22 ; d1
    pshufb m25, m8
    vpdpwssd m18, m13, m23 ; e1
    shufpd m22, m24, 0x55
    vpdpwssd m7, m13, m24 ; d1'
    shufpd m23, m25, 0x55
    vpdpwssd m19, m13, m25 ; e1'
    pslldq m0, 1
    vpdpwssd m6, m14, m22 ; d2
    pslldq m1, 1
    vpdpwssd m18, m14, m23 ; e2
    vpermt2b m2, m27, m4 ; 12
    vpdpwssd m7, m12, m22 ; d0'
    vpermt2b m3, m27, m5 ; 12'
    vpdpwssd m19, m12, m23 ; e0'
    vpshrdd m0, m2, 16 ; 01
    vpermt2b m6, m27, m18 ; 34
    vpshrdd m1, m3, 16 ; 01'
    vpermt2b m7, m27, m19 ; 34'
    vpshrdd m4, m2, m6, 16 ; 23
    vpshrdd m5, m3, m7, 16 ; 23'
.hv_w32_loop:
    movu m22, [r7+ssq*1+ 0]
    movu m24, [r7+ssq*1+12]
    lea r7, [r7+ssq*2]
    movu m23, [r7+ssq*0+ 0]
    movu m25, [r7+ssq*0+12]
    mova m19, m11
    vpdpwssd m19, m15, m2 ; B0
    mova m21, m11
    vpdpwssd m21, m15, m3 ; B0'
    mova m18, m11
    vpdpwssd m18, m15, m0 ; A0
    mova m20, m11
    vpdpwssd m20, m15, m1 ; A0'
    mova m2, m6
    vpdpwssd m19, m16, m6 ; B1
    mova m3, m7
    vpdpwssd m21, m16, m7 ; B1'
    mova m0, m4
    vpdpwssd m18, m16, m4 ; A1
    mova m1, m5
    pshufb m4, m22, m8
    vpdpwssd m20, m16, m5 ; A1'
    mova m6, m10
    pshufb m7, m23, m8
    vpdpwssd m6, m12, m4 ; f0
    mova m4, m10
    pshufb m5, m24, m9
    vpdpwssd m4, m12, m7 ; g0
    mova m7, m10
    pshufb m26, m25, m9
    vpdpwssd m7, m14, m5 ; f2'
    mova m5, m10
    pshufb m22, m9
    vpdpwssd m5, m14, m26 ; g2'
    pshufb m23, m9
    vpdpwssd m6, m13, m22 ; f1
    pshufb m24, m8
    vpdpwssd m4, m13, m23 ; g1
    pshufb m25, m8
    vpdpwssd m7, m13, m24 ; f1'
    shufpd m22, m24, 0x55
    vpdpwssd m5, m13, m25 ; g1'
    shufpd m23, m25, 0x55
    vpdpwssd m6, m14, m22 ; f2
    vpdpwssd m4, m14, m23 ; g2
    vpdpwssd m7, m12, m22 ; f0'
    vpdpwssd m5, m12, m23 ; g0'
    vpermt2b m6, m27, m4 ; 56
    vpermt2b m7, m27, m5 ; 56'
    vpdpwssd m19, m17, m6 ; B2
    vpshrdd m4, m2, m6, 16 ; 45
    vpdpwssd m21, m17, m7 ; B2'
    vpshrdd m5, m3, m7, 16 ; 45'
    vpdpwssd m18, m17, m4 ; A2
    vpdpwssd m20, m17, m5 ; A2'
    vpermt2b m19, m28, m21
    vpermt2b m18, m28, m20
    mova [r8+wq*0], m18
    mova [r8+wq*2], m19
    lea r8, [r8+wq*4]
    sub hd, 2
    jg .hv_w32_loop
    add srcq, 64
    add tmpq, 64
    movzx hd, r5b
    sub r5d, 1<<8
    jg .hv_w32_loop0
%if WIN64
    pop r8
%endif
    RET

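; 8-tap prep variants (any combination involving SHARP); calls without any
; subpel offset are forwarded to the 6-tap .prep copy/scale path above.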
PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP, prep_8tap_16bpc
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH, prep_8tap_16bpc
PREP_8TAP_FN regular_sharp, REGULAR, SHARP, prep_8tap_16bpc
PREP_8TAP_FN sharp_regular, SHARP, REGULAR, prep_8tap_16bpc
PREP_8TAP_FN sharp, SHARP, SHARP

cglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my
%define base r7-prep_avx512icl
    imul mxd, mxm, 0x010101
    add mxd, t0d ; 8tap_h, mx, 4tap_h
    imul myd, mym, 0x010101
    add myd, t1d ; 8tap_v, my, 4tap_v
    lea r7, [prep_avx512icl]
    mov wd, wm
    movifnidn hd, hm
    test mxd, 0xf00
    jnz .h
    test myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep
.v:
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    mov r5d, r7m
    vpbroadcastd m10, [prep_8tap_rnd]
    pmovsxbw xmm0, [base+subpel_filters+myq*8]
    tzcnt r6d, wd
    shr r5d, 11
    movzx r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
    psllw xmm0, [base+prep_hv_shift+r5*8]
    add r7, r6
    lea r6, [strideq*3]
    sub srcq, r6
    mova [tmpq], xmm0
    vpbroadcastd m12, xmm0
    vpbroadcastd m13, [tmpq+ 4]
    vpbroadcastd m14, [tmpq+ 8]
    vpbroadcastd m15, [tmpq+12]
    jmp r7
.v_w4:
    mov r3d, 0x330c
    movq xm1, [srcq+strideq*0]
    kmovw k1, r3d
    vpbroadcastq ym1{k1}, [srcq+strideq*1]
    vpbroadcastq m0, [srcq+r6 ]
    vinserti32x4 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3
    lea srcq, [srcq+strideq*4]
    vpbroadcastq ym0{k1}, [srcq+strideq*0]
    vpbroadcastq m2, [srcq+strideq*1]
    vinserti32x4 m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6
    mova ym5, [prep_endA]
    vshufi32x4 m3, m1, m0, q1021 ; 1 2 3 4
    vshufi32x4 m2, m1, m0, q2132 ; 2 3 4 5
    punpcklwd m1, m3 ; 01 12 23 34
    punpcklwd m2, m0 ; 23 34 45 56
.v_w4_loop:
    movq xm4, [srcq+r6 ]
    lea srcq, [srcq+strideq*4]
    vpbroadcastq ym4{k1}, [srcq+strideq*0]
    vpbroadcastq m3, [srcq+strideq*1]
    vinserti32x4 m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a
    mova m3, m10
    vpdpwssd m3, m12, m1 ; a0 b0 c0 d0
    valignq m1, m4, m0, 6 ; 6 7 8 9
    vpdpwssd m3, m13, m2 ; a1 b1 c1 d1
    mova m0, m4
    punpcklwd m4, m1, m4 ; 67 78 89 9a
    vpdpwssd m3, m15, m4 ; a3 b3 c3 d3
    vshufi32x4 m1, m2, m4, q1032 ; 45 56 67 78
    vpdpwssd m3, m14, m1 ; a2 b2 c2 d2
    mova m2, m4
    vpermb m3, m5, m3
    mova [tmpq], ym3
    add tmpq, 32
    sub hd, 4
    jg .v_w4_loop
    RET
.v_w8:
    movu xm0, [srcq+strideq*0]
    mov r3d, 0x33
    vbroadcasti32x4 ym1, [srcq+strideq*1]
    kmovb k1, r3d
    mova m7, [spel_v_shuf8]
    vinserti64x2 m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2
    add srcq, r6
    vbroadcasti32x4 ym2, [srcq+strideq*0]
    vbroadcasti32x4 m3, [srcq+strideq*1]
    vbroadcasti32x4 ym0, [srcq+strideq*2]
    vshufi64x2 m2{k1}, m1, m3, q1032 ; 2 3 4
    vinserti64x2 m0{k1}, m3, [srcq+r6], 2 ; 4 5 6
    mova m8, [prep_endB]
    vpermb m1, m7, m1 ; 01 12
    vpermb m2, m7, m2 ; 23 34
    vpermb m3, m7, m0 ; 45 56
.v_w8_loop:
    lea srcq, [srcq+strideq*4]
    vbroadcasti32x4 ym4, [srcq+strideq*0]
    movu xm5, [srcq+strideq*1]
    vshufi64x2 m4{k1}, m0, m5, q1032 ; 6 7 8
    vbroadcasti32x4 ym0, [srcq+strideq*2]
    vinserti64x2 m0{k1}, m5, [srcq+r6], 2 ; 8 9 a
    mova m5, m10
    vpdpwssd m5, m12, m1 ; a0 b0
    mova m6, m10
    vpdpwssd m6, m12, m2 ; c0 d0
    mova m1, m3
    vpdpwssd m5, m13, m2 ; a1 b1
    vpdpwssd m6, m13, m3 ; c1 d1
    vpermb m2, m7, m4 ; 67 78
    vpdpwssd m5, m14, m3 ; a2 b2
    vpermb m3, m7, m0 ; 89 9a
    vpdpwssd m6, m14, m2 ; c2 d2
    vpdpwssd m5, m15, m2 ; a3 b3
    vpdpwssd m6, m15, m3 ; c3 d3
    vpermt2b m5, m8, m6
    mova [tmpq], m5
    add tmpq, 64
    sub hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    vbroadcasti32x8 m0, [srcq+strideq*1]
    vinserti32x8 m1, m0, [srcq+strideq*2], 1
    vinserti32x8 m0, [srcq+strideq*0], 0
    mova m8, [spel_v_shuf16]
    add srcq, r6
    movu ym3, [srcq+strideq*0]
    vinserti32x8 m3, [srcq+strideq*1], 1
    movu ym5, [srcq+strideq*2]
    add srcq, r6
    vinserti32x8 m5, [srcq+strideq*0], 1
    mova m11, [prep_endA]
    vpermb m1, m8, m1 ; 12
    vpermb m0, m8, m0 ; 01
    vpermb m3, m8, m3 ; 34
    vpermb m5, m8, m5 ; 56
    vpshrdd m2, m1, m3, 16 ; 23
    vpshrdd m4, m3, m5, 16 ; 45
.v_w16_loop:
    mova m7, m10
    vpdpwssd m7, m12, m1 ; b0
    mova m6, m10
    vpdpwssd m6, m12, m0 ; a0
    mova m1, m3
    vpdpwssd m7, m13, m3 ; b1
    mova m0, m2
    vpdpwssd m6, m13, m2 ; a1
    mova m3, m5
    vpdpwssd m7, m14, m5 ; b2
    mova m2, m4
    vpdpwssd m6, m14, m4 ; a2
    movu ym5, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    vinserti32x8 m5, [srcq+strideq*0], 1
    vpermb m5, m8, m5 ; 78
    vpshrdd m4, m3, m5, 16 ; 67
    vpdpwssd m7, m15, m5 ; b3
    vpdpwssd m6, m15, m4 ; a3
    vpermt2b m6, m11, m7
    mova [tmpq], m6
    add tmpq, 64
    sub hd, 2
    jg .v_w16_loop
    RET
.v_w32:
.v_w64:
.v_w128:
    WIN64_PUSH_XMM 23
%if WIN64
    push r8
%endif
    mova m11, [prep_endC]
    lea r5, [hq+wq*8-256]
.v_w32_loop0:
    movu m16, [srcq+strideq*0]
    movu m17, [srcq+strideq*1]
    lea r7, [srcq+r6]
    movu m18, [srcq+strideq*2]
    movu m19, [r7 +strideq*0]
    mov r8, tmpq
    movu m20, [r7 +strideq*1]
    movu m21, [r7 +strideq*2]
    add r7, r6
    movu m22, [r7 +strideq*0]
    punpcklwd m0, m16, m17 ; 01l
    punpckhwd m16, m17 ; 01h
    punpcklwd m1, m17, m18 ; 12l
    punpckhwd m17, m18 ; 12h
    punpcklwd m2, m18, m19 ; 23l
    punpckhwd m18, m19 ; 23h
    punpcklwd m3, m19, m20 ; 34l
    punpckhwd m19, m20 ; 34h
    punpcklwd m4, m20, m21 ; 45l
    punpckhwd m20, m21 ; 45h
    punpcklwd m5, m21, m22 ; 56l
    punpckhwd m21, m22 ; 56h
.v_w32_loop:
    mova m6, m10
    vpdpwssd m6, m12, m0 ; a0l
    mova m8, m10
    vpdpwssd m8, m12, m16 ; a0h
    mova m7, m10
    vpdpwssd m7, m12, m1 ; b0l
    mova m9, m10
    vpdpwssd m9, m12, m17 ; b0h
    mova m0, m2
    vpdpwssd m6, m13, m2 ; a1l
    mova m16, m18
    vpdpwssd m8, m13, m18 ; a1h
    mova m1, m3
    vpdpwssd m7, m13, m3 ; b1l
    mova m17, m19
    vpdpwssd m9, m13, m19 ; b1h
    mova m2, m4
    vpdpwssd m6, m14, m4 ; a2l
    mova m18, m20
    vpdpwssd m8, m14, m20 ; a2h
    mova m3, m5
    vpdpwssd m7, m14, m5 ; b2l
    mova m19, m21
    vpdpwssd m9, m14, m21 ; b2h
    movu m21, [r7+strideq*1]
    lea r7, [r7+strideq*2]
    punpcklwd m4, m22, m21 ; 67l
    punpckhwd m20, m22, m21 ; 67h
    movu m22, [r7+strideq*0]
    vpdpwssd m6, m15, m4 ; a3l
    vpdpwssd m8, m15, m20 ; a3h
    punpcklwd m5, m21, m22 ; 78l
    punpckhwd m21, m22 ; 78h
    vpdpwssd m7, m15, m5 ; b3l
    vpdpwssd m9, m15, m21 ; b3h
    vpermt2b m6, m11, m8
    vpermt2b m7, m11, m9
    mova [r8+wq*0], m6
    mova [r8+wq*2], m7
    lea r8, [r8+wq*4]
    sub hd, 2
    jg .v_w32_loop
    add srcq, 64
    add tmpq, 64
    movzx hd, r5b
    sub r5d, 1<<8
    jg .v_w32_loop0
%if WIN64
    pop r8
%endif
    RET
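; Horizontal-only 8-tap prep; .h below selects these by width and loads the
; bitdepth-shifted filter taps (via prep_hv_shift) before dispatching.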
.h_w4:
|
|
RESET_STACK_STATE
|
|
movzx mxd, mxb
|
|
sub srcq, 2
|
|
pmovsxbw xmm0, [base+subpel_filters+mxq*8]
|
|
mov r5d, r7m
|
|
vbroadcasti32x4 m4, [spel_h_shufA]
|
|
vbroadcasti32x4 m5, [spel_h_shufB]
|
|
shr r5d, 11
|
|
mova ym9, [prep_endA]
|
|
psllw xmm0, [base+prep_hv_shift+r5*8]
|
|
mova [tmpq], xmm0
|
|
vpbroadcastd m6, [tmpq+4]
|
|
vpbroadcastd m7, [tmpq+8]
|
|
.h_w4_loop:
|
|
movu xm2, [srcq+strideq*0]
|
|
vinserti32x4 ym2, [srcq+strideq*1], 1
|
|
vinserti32x4 m2, [srcq+strideq*2], 2
|
|
vinserti32x4 m2, [srcq+r6 ], 3
|
|
lea srcq, [srcq+strideq*4]
|
|
mova m0, m10
|
|
pshufb m1, m2, m4
|
|
vpdpwssd m0, m6, m1
|
|
pshufb m2, m5
|
|
vpdpwssd m0, m7, m2
|
|
vpermb m0, m9, m0
|
|
mova [tmpq], ym0
|
|
add tmpq, 32
|
|
sub hd, 4
|
|
jg .h_w4_loop
|
|
RET
|
|
.h_w8:
|
|
mova m6, [spel_h_shufA]
|
|
movu m7, [spel_h_shufB]
|
|
movu m8, [spel_h_shufC]
|
|
mova m9, [spel_h_shufD]
|
|
mova m11, [prep_endB]
|
|
.h_w8_loop:
|
|
movu ym4, [srcq+strideq*0]
|
|
vinserti32x8 m4, [srcq+strideq*1], 1
|
|
movu ym5, [srcq+strideq*2]
|
|
vinserti32x8 m5, [srcq+r6 ], 1
|
|
lea srcq, [srcq+strideq*4]
|
|
mova m0, m10
|
|
mova m1, m10
|
|
vpermb m2, m6, m4
|
|
vpermb m3, m6, m5
|
|
vpdpwssd m0, m12, m2
|
|
vpdpwssd m1, m12, m3
|
|
vpermb m2, m7, m4
|
|
vpermb m3, m7, m5
|
|
vpdpwssd m0, m13, m2
|
|
vpdpwssd m1, m13, m3
|
|
vpermb m2, m8, m4
|
|
vpermb m3, m8, m5
|
|
vpdpwssd m0, m14, m2
|
|
vpdpwssd m1, m14, m3
|
|
vpermb m2, m9, m4
|
|
vpermb m3, m9, m5
|
|
vpdpwssd m0, m15, m2
|
|
vpdpwssd m1, m15, m3
|
|
vpermt2b m0, m11, m1
|
|
mova [tmpq], m0
|
|
add tmpq, 64
|
|
sub hd, 4
|
|
jg .h_w8_loop
|
|
RET
|
|
.h:
    vpbroadcastd m10, [prep_8tap_rnd]
    test myd, 0xf00
    jnz .hv
    lea r6, [strideq*3]
    cmp wd, 4
    je .h_w4
    shr mxd, 16
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    mov r5d, r7m
    sub srcq, 6
    shr r5d, 11
    psllw xmm0, [base+prep_hv_shift+r5*8]
    mova [tmpq], xmm0
    vpbroadcastd m12, xmm0
    vpbroadcastd m13, [tmpq+ 4]
    vpbroadcastd m14, [tmpq+ 8]
    vpbroadcastd m15, [tmpq+12]
    cmp wd, 16
    jl .h_w8
    vbroadcasti32x4 m6, [spel_h_shufA]
    vbroadcasti32x4 m7, [spel_h_shufB]
    mova m11, [prep_endC]
    jg .h_w32
.h_w16_loop:
    movu ym2, [srcq+strideq*0+ 0]
    vinserti32x8 m2, [srcq+strideq*1+ 0], 1
    movu ym3, [srcq+strideq*0+16]
    vinserti32x8 m3, [srcq+strideq*1+16], 1
    lea srcq, [srcq+strideq*2]
    mova m0, m10
    mova m1, m10
    pshufb m4, m2, m6
    vpdpwssd m0, m12, m4 ; a0
    pshufb m4, m3, m6
    vpdpwssd m1, m14, m4 ; b2
    pshufb m4, m2, m7
    vpdpwssd m0, m13, m4 ; a1
    pshufb m4, m3, m7
    vpdpwssd m1, m15, m4 ; b3
    shufpd m2, m3, 0x55
    pshufb m4, m2, m6
    vpdpwssd m0, m14, m4 ; a2
    vpdpwssd m1, m12, m4 ; b0
    pshufb m2, m7
    vpdpwssd m0, m15, m2 ; a3
    vpdpwssd m1, m13, m2 ; b1
    vpermt2b m0, m11, m1
    mova [tmpq], m0
    add tmpq, 64
    sub hd, 2
    jg .h_w16_loop
    RET
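; .h_w32: also covers w == 64 and w == 128; r6 walks the row in
; 32-pixel steps, with srcq pre-advanced by the row width.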
.h_w32:
    lea srcq, [srcq+wq*2]
    neg wq
.h_w32_loop0:
    mov r6, wq
.h_w32_loop:
    movu m2, [srcq+r6*2+ 0]
    movu m3, [srcq+r6*2+ 8]
    mova m0, m10
    mova m1, m10
    pshufb m4, m2, m6
    vpdpwssd m0, m12, m4 ; a0
    pshufb m4, m3, m6
    vpdpwssd m1, m12, m4 ; b0
    vpdpwssd m0, m14, m4 ; a2
    movu m4, [srcq+r6*2+16]
    pshufb m3, m7
    vpdpwssd m1, m13, m3 ; b1
    vpdpwssd m0, m15, m3 ; a3
    pshufb m3, m4, m6
    vpdpwssd m1, m14, m3 ; b2
    pshufb m2, m7
    vpdpwssd m0, m13, m2 ; a1
    pshufb m4, m7
    vpdpwssd m1, m15, m4 ; b3
    vpermt2b m0, m11, m1
    mova [tmpq], m0
    add tmpq, 64
    add r6, 32
    jl .h_w32_loop
    add srcq, strideq
    dec hd
    jg .h_w32_loop0
    RET
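; .hv: combined horizontal + vertical filtering. The w == 4 case
; follows inline; wider blocks branch to .hv_w8 below.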
.hv:
    vpbroadcastd m11, [pd_128]
    cmp wd, 4
    jg .hv_w8
    movzx mxd, mxb
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 4
    cmove myd, mxd
    mov r5d, r7m
    pmovsxbw xmm1, [base+subpel_filters+myq*8]
    lea r6, [strideq*3]
    sub srcq, 2
    shr r5d, 11
    sub srcq, r6
    psllw xmm0, [base+prep_hv_shift+r5*8]
    psllw xmm1, 2
    mova [tmpq+ 0], xmm0
    mova [tmpq+16], xmm1
    vpbroadcastd m12, xmm1
    movu xm16, [srcq+strideq*0]
    mov r3d, 0xff0
    vinserti128 ym16, [srcq+strideq*1], 1
    kmovw k1, r3d
    vbroadcasti32x4 m18, [srcq+strideq*2]
    add srcq, r6
    vinserti64x2 m16{k1}, m18, [srcq+strideq*0], 3
    movu xm17, [srcq+strideq*1]
    vbroadcasti32x4 ym18, [srcq+strideq*2]
    add srcq, r6
    vinserti32x4 m17{k1}, m18, [srcq+strideq*0], 2
    vbroadcasti32x4 m5, [spel_h_shufA]
    vbroadcasti32x4 m6, [spel_h_shufB]
    vpbroadcastd m8, [tmpq+ 4]
    vpbroadcastd m9, [tmpq+ 8]
    mova m1, m10
    mova m19, [spel_shuf4a]
    mova m2, m10
    pshufb m0, m16, m5
    vpdpwssd m1, m8, m0
    pshufb m0, m17, m5
    vpdpwssd m2, m8, m0
    vpbroadcastd m13, [tmpq+20]
    pshufb m16, m6
    vpbroadcastd m14, [tmpq+24]
    pshufb m17, m6
    vpbroadcastd m15, [tmpq+28]
    vpdpwssd m1, m9, m16 ; 0 1 2 3
    vpdpwssd m2, m9, m17 ; 4 5 6
    mova m7, [spel_shuf4b]
    vpermt2b m1, m19, m2 ; 01 12 23 34
    vpermb m2, m19, m2 ; 45 56
    mova ym19, [prep_endA]
    vshufi32x4 m2, m1, m2, q1032 ; 23 34 45 56
.hv_w4_loop:
    movu xm17, [srcq+strideq*1]
    vinserti128 ym17, [srcq+strideq*2], 1
    vbroadcasti32x4 m16, [srcq+r6 ]
    lea srcq, [srcq+strideq*4]
    vinserti64x2 m17{k1}, m16, [srcq+strideq*0], 3
    mova m18, m10
    pshufb m16, m17, m5
    vpdpwssd m18, m8, m16
    mova m16, m11
    vpdpwssd m16, m12, m1 ; a0 b0 c0 d0
    pshufb m17, m6
    vpdpwssd m18, m9, m17 ; 7 8 9 a
    mova m1, m2
    vpdpwssd m16, m13, m2 ; a1 b1 c1 d1
    vpermt2b m2, m7, m18 ; 67 78 89 9a
    vpdpwssd m16, m15, m2 ; a3 b3 c3 d3
    vshufi32x4 m1, m2, q1032 ; 45 56 67 78
    vpdpwssd m16, m14, m1 ; a2 b2 c2 d2
    vpermb m16, m19, m16
    mova [tmpq], ym16
    add tmpq, 32
    sub hd, 4
    jg .hv_w4_loop
    vzeroupper
    RET
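; .hv_w8: 8-wide h+v path. Rows 0-6 are filtered horizontally up front
; and interleaved into the 01/12, 23/34 and 45/56 pair registers that
; the vertical loop then rotates.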
.hv_w8:
    shr mxd, 16
    pmovsxbw xmm0, [base+subpel_filters+mxq*8]
    movzx mxd, myb
    shr myd, 16
    cmp hd, 6
    cmovs myd, mxd
    mov r5d, r7m
    pmovsxbw xmm1, [base+subpel_filters+myq*8]
    lea r6, [strideq*3]
    sub srcq, 6
    shr r5d, 11
    sub srcq, r6
    psllw xmm0, [base+prep_hv_shift+r5*8]
    psllw xmm1, 2
    mova [tmpq+ 0], xmm0
    mova [tmpq+16], xmm1
    vpbroadcastd m12, xmm0
    vpbroadcastd m13, [tmpq+ 4]
    vpbroadcastd m14, [tmpq+ 8]
    vpbroadcastd m15, [tmpq+12]
    vpbroadcastd m16, xmm1
    vpbroadcastd m17, [tmpq+20]
    vpbroadcastd m18, [tmpq+24]
    vpbroadcastd m19, [tmpq+28]
    cmp wd, 8
    jg .hv_w16
    WIN64_SPILL_XMM 23
    mova m5, [spel_h_shufA]
    movu ym0, [srcq+strideq*0]
    vinserti32x8 m0, [srcq+strideq*1], 1 ; 0 1
    movu ym9, [srcq+strideq*2]
    add srcq, r6
    vinserti32x8 m9, [srcq+strideq*0], 1 ; 2 3
    movu ym20, [srcq+strideq*1]
    vinserti32x8 m20, [srcq+strideq*2], 1 ; 4 5
    add srcq, r6
    movu ym21, [srcq+strideq*0] ; 6
    movu m6, [spel_h_shufB]
    movu m7, [spel_h_shufC]
    mova ym22, [prep_endB]
    vpermb m8, m5, m0
    mova m1, m10
    vpdpwssd m1, m12, m8 ; a0 b0
    vpermb m8, m5, m9
    mova m2, m10
    vpdpwssd m2, m12, m8 ; c0 d0
    vpermb m8, m5, m20
    mova m3, m10
    vpdpwssd m3, m12, m8 ; e0 f0
    vpermb m8, m5, m21
    mova m4, m10
    vpdpwssd m4, m12, m8 ; g0
    vpermb m8, m6, m0
    vpdpwssd m1, m13, m8 ; a1 b1
    vpermb m8, m6, m9
    vpdpwssd m2, m13, m8 ; c1 d1
    vpermb m8, m6, m20
    vpdpwssd m3, m13, m8 ; e1 f1
    vpermb m8, m6, m21
    vpdpwssd m4, m13, m8 ; g1
    vpermb m8, m7, m0
    vpdpwssd m1, m14, m8 ; a2 b2
    vpermb m8, m7, m9
    vpdpwssd m2, m14, m8 ; c2 d2
    vpermb m8, m7, m20
    vpdpwssd m3, m14, m8 ; e2 f2
    vpermb m8, m7, m21
    vpdpwssd m4, m14, m8 ; g2
    mova m8, [spel_h_shufD]
    vpermb m0, m8, m0
    vpdpwssd m1, m15, m0 ; a3 b3
    mova m0, [spel_shuf8a]
    vpermb m9, m8, m9
    vpdpwssd m2, m15, m9 ; c3 d3
    mova m9, [spel_shuf8b]
    vpermb m20, m8, m20
    vpdpwssd m3, m15, m20 ; e3 f3
    vpermb m21, m8, m21
    vpdpwssd m4, m15, m21 ; g3
    vpermt2b m1, m0, m2 ; 01 12
    vpermt2b m2, m0, m3 ; 23 34
    vpermt2b m3, m0, m4 ; 45 56
.hv_w8_loop:
    movu ym0, [srcq+strideq*1]
    lea srcq, [srcq+strideq*2]
    vinserti32x8 m0, [srcq+strideq*0], 1
    mova m4, m10
    mova m20, m11
    vpermb m21, m5, m0
    vpdpwssd m4, m12, m21 ; h0 i0
    vpermb m21, m6, m0
    vpdpwssd m20, m16, m1 ; A0 B0
    vpdpwssd m4, m13, m21 ; h1 i1
    vpermb m21, m7, m0
    mova m1, m2
    vpdpwssd m20, m17, m2 ; A1 B1
    vpdpwssd m4, m14, m21 ; h2 i2
    vpermb m21, m8, m0
    mova m2, m3
    vpdpwssd m20, m18, m3 ; A2 B2
    vpdpwssd m4, m15, m21 ; h3 i3
    vpermt2b m3, m9, m4 ; 67 78
    vpdpwssd m20, m19, m3 ; A3 B3
    vpermb m20, m22, m20
    mova [tmpq], ym20
    add tmpq, 32
    sub hd, 2
    jg .hv_w8_loop
    RET
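; .hv_w16: w >= 16 h+v path, processed in 16-pixel-wide column tiles;
; r5d packs the remaining tile count (high bits) and row count (low byte).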
.hv_w16:
    WIN64_SPILL_XMM 27
%if WIN64
    push r8
%endif
    vbroadcasti32x4 m20, [spel_h_shufA]
    vbroadcasti32x4 m21, [spel_h_shufB]
    add wd, wd
    mova m9, [spel_shuf16]
    mova m26, [prep_endB]
    lea r5d, [hq+wq*8-256]
.hv_w16_loop0:
    vbroadcasti32x8 m5, [srcq+strideq*0+ 8]
    vinserti32x8 m4, m5, [srcq+strideq*0+ 0], 0
    vinserti32x8 m5, [srcq+strideq*0+16], 1 ; 0
    movu ym6, [srcq+strideq*1+ 0]
    movu ym7, [srcq+strideq*1+16]
    lea r7, [srcq+r6]
    vinserti32x8 m6, [srcq+strideq*2+ 0], 1
    vinserti32x8 m7, [srcq+strideq*2+16], 1 ; 1 2
    movu ym22, [r7 +strideq*0+ 0]
    movu ym23, [r7 +strideq*0+16]
    mov r8, tmpq
    vinserti32x8 m22, [r7 +strideq*1+ 0], 1
    vinserti32x8 m23, [r7 +strideq*1+16], 1 ; 3 4
    movu ym24, [r7 +strideq*2+ 0]
    movu ym25, [r7 +strideq*2+16]
    add r7, r6
    vinserti32x8 m24, [r7 +strideq*0+ 0], 1
    vinserti32x8 m25, [r7 +strideq*0+16], 1 ; 5 6
    pshufb m0, m4, m20
    mova m1, m10
    vpdpwssd m1, m12, m0 ; a0
    pshufb m0, m6, m20
    mova m2, m10
    vpdpwssd m2, m12, m0 ; b0
    pshufb m0, m7, m20
    mova m3, m10
    vpdpwssd m3, m14, m0 ; c2
    pshufb m0, m4, m21
    vpdpwssd m1, m13, m0 ; a1
    pshufb m0, m6, m21
    vpdpwssd m2, m13, m0 ; b1
    pshufb m0, m7, m21
    vpdpwssd m3, m15, m0 ; c3
    pshufb m0, m5, m20
    vpdpwssd m1, m14, m0 ; a2
    shufpd m6, m7, 0x55
    pshufb m7, m6, m20
    vpdpwssd m2, m14, m7 ; b2
    vpdpwssd m3, m12, m7 ; c0
    pshufb m5, m21
    vpdpwssd m1, m15, m5 ; a3
    pshufb m6, m21
    vpdpwssd m2, m15, m6 ; b3
    vpdpwssd m3, m13, m6 ; c1
    pshufb m0, m22, m20
    mova m4, m10
    vpdpwssd m4, m12, m0 ; d0
    pshufb m0, m23, m20
    mova m5, m10
    vpdpwssd m5, m14, m0 ; e2
    pshufb m0, m24, m20
    mova m6, m10
    vpdpwssd m6, m12, m0 ; f0
    pshufb m0, m25, m20
    mova m7, m10
    vpdpwssd m7, m14, m0 ; g2
    pshufb m0, m22, m21
    vpdpwssd m4, m13, m0 ; d1
    pshufb m0, m23, m21
    vpdpwssd m5, m15, m0 ; e3
    pshufb m0, m24, m21
    vpdpwssd m6, m13, m0 ; f1
    pshufb m0, m25, m21
    vpdpwssd m7, m15, m0 ; g3
    shufpd m22, m23, 0x55
    pshufb m23, m22, m20
    vpdpwssd m4, m14, m23 ; d2
    vpdpwssd m5, m12, m23 ; e0
    shufpd m24, m25, 0x55
    pshufb m25, m24, m20
    vpdpwssd m6, m14, m25 ; f2
    vpdpwssd m7, m12, m25 ; g0
    pshufb m22, m21
    vpdpwssd m4, m15, m22 ; d3
    vpdpwssd m5, m13, m22 ; e1
    pshufb m24, m21
    vpdpwssd m6, m15, m24 ; f3
    vpdpwssd m7, m13, m24 ; g1
    pslldq m1, 1
    vpermt2b m2, m9, m3 ; 12
    vpermt2b m4, m9, m5 ; 34
    vpermt2b m6, m9, m7 ; 56
    vpshrdd m1, m2, 16 ; 01
    vpshrdd m3, m2, m4, 16 ; 23
    vpshrdd m5, m4, m6, 16 ; 45
.hv_w16_loop:
    movu ym24, [r7+strideq*1+ 0]
    movu ym25, [r7+strideq*1+16]
    lea r7, [r7+strideq*2]
    vinserti32x8 m24, [r7+strideq*0+ 0], 1
    vinserti32x8 m25, [r7+strideq*0+16], 1
    mova m7, m10
    mova m8, m10
    pshufb m0, m24, m20
    vpdpwssd m7, m12, m0 ; h0
    mova m22, m11
    pshufb m0, m25, m20
    vpdpwssd m8, m14, m0 ; i2
    mova m23, m11
    vpdpwssd m22, m16, m1 ; A0
    mova m1, m3
    vpdpwssd m23, m16, m2 ; B0
    mova m2, m4
    pshufb m0, m24, m21
    vpdpwssd m7, m13, m0 ; h1
    pshufb m0, m25, m21
    vpdpwssd m8, m15, m0 ; i3
    vpdpwssd m22, m17, m3 ; A1
    mova m3, m5
    vpdpwssd m23, m17, m4 ; B1
    mova m4, m6
    shufpd m24, m25, 0x55
    pshufb m25, m24, m20
    vpdpwssd m7, m14, m25 ; h2
    vpdpwssd m8, m12, m25 ; i0
    vpdpwssd m22, m18, m5 ; A2
    vpdpwssd m23, m18, m6 ; B2
    pshufb m24, m21
    vpdpwssd m7, m15, m24 ; h3
    vpdpwssd m8, m13, m24 ; i1
    vpermt2b m7, m9, m8 ; 78
    vpshrdd m5, m6, m7, 16 ; 67
    vpdpwssd m22, m19, m5 ; A3
    vpdpwssd m23, m19, m7 ; B3
    mova m6, m7
    vpermt2b m22, m26, m23
    mova [r8+wq*0], ym22
    vextracti32x8 [r8+wq*1], m22, 1
    lea r8, [r8+wq*2]
    sub hd, 2
    jg .hv_w16_loop
    add srcq, 32
    add tmpq, 32
    movzx hd, r5b
    sub r5d, 1<<8
    jg .hv_w16_loop0
%if WIN64
    pop r8
%endif
    RET

%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

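; warp_affine_8x8t: same filtering as warp_affine_8x8 below, but the
; results are kept at intermediate precision (psrad 15 + packssdw) and
; written to the tmp buffer (tsq doubled for 16-bit elements) via the
; shared .end tail.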
cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
%define base r6-pd_0to7
    mov t0d, r7m
    lea r6, [pd_0to7]
    shr t0d, 11
    vpbroadcastd m8, [base+warp_8x8t_rnd_v]
    vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
    psrad m14, m16, 15
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    psrad m16, 15
    packssdw m14, m16
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    psrad m15, m16, 15
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    add tsq, tsq
    psrad m16, 15
    packssdw m15, m16
    jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end

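; warp_affine_8x8: 8x8 warped motion compensation. .main sets up the
; per-row filter phases (mx/my stepped by alpha/beta/gamma/delta) and
; fetches filter pairs with gathers; .h performs one horizontal pass
; over two rows and .main2 accumulates the 8-tap vertical filter.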
cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
    mov t0d, r7m ; pixel_max
    lea r6, [pd_0to7]
    shr t0d, 11
    vpbroadcastd m1, [base+warp_8x8_rnd_h+t0*4]
    vpbroadcastd m8, [base+warp_8x8_rnd_v+t0*4]
    call .main
    psrad m14, m16, 13
    call .main2
    psrad m16, 13
    packusdw m14, m16
    call .main2
    psrad m15, m16, 13
    call .main2
    vpbroadcastd m0, [base+bidir_shift+t0*4]
    vpsrlvw m14, m0
    psrad m16, 13
    packusdw m15, m16
    vpsrlvw m15, m0
.end:
    mova m0, [base+warp8x8_end]
    vpermb m16, m0, m14
    lea r2, [dsq*3]
    mova [dstq+dsq*0], xm16
    vextracti128 [dstq+dsq*1], ym16, 1
    vextracti32x4 [dstq+dsq*2], m16, 2
    vextracti32x4 [dstq+r2 ], m16, 3
    vpermb m16, m0, m15
    lea dstq, [dstq+dsq*4]
    mova [dstq+dsq*0], xm16
    vextracti128 [dstq+dsq*1], ym16, 1
    vextracti32x4 [dstq+dsq*2], m16, 2
    vextracti32x4 [dstq+r2 ], m16, 3
    RET
.main:
    vpbroadcastd ym3, [base+pd_512]
%if WIN64
    mov abcdq, r5mp
    vpaddd ym18, ym3, r6m {1to8} ; mx
%else
    add r5d, 512
    vpbroadcastd ym18, r5d
%endif
    vpaddd ym20, ym3, r7m {1to8} ; my
    mova ym16, [base+pd_0to7]
    vpbroadcastd ym19, [abcdq+4*0] ; alpha
    vpbroadcastd ym21, [abcdq+4*1] ; gamma
    lea r4, [ssq*3+6]
    vpdpwssd ym18, ym19, ym16 ; tmx
    vpdpwssd ym20, ym21, ym16 ; tmy
    sub srcq, r4
    mova m10, [base+warp8x8_permA]
    lea r4, [mc_warp_filter+64*8]
    vbroadcasti32x4 m12, [base+warp8x8_permC]
    kxnorb k1, k1, k1
    vbroadcasti32x4 m13, [base+warp8x8_permD]
    movu ym5, [srcq+0]
    vinserti32x8 m5, [srcq+8], 1
    psrad ym17, ym18, 10
    mova m11, [base+warp8x8_permB]
    kmovb k2, k1
    vpgatherdq m3{k1}, [r4+ym17*8] ; filter_x0
    psrad ym19, 16 ; beta
    psrad ym21, 16 ; delta
    paddd ym18, ym19
    vpermb m4, m10, m5
    vpbroadcastq m9, [base+warp_shift_h+t0*8]
    pshufd m3, m3, q3120
    paddd m7, m1, m1
    pshufb m2, m3, m12
    vpdpwssd m1, m4, m2
    vpermb m5, m11, m5
    vshufi32x4 m4, m5, q1021
    pshufb m3, m13
    vpdpwssd m1, m4, m3
    call .h
    psllq m2, m1, 32
    paddd m1, m2
    vpmultishiftqb m1, m9, m1
    vpshrdq m1, m0, 48 ; 01 12
    call .h
    vpshrdq m2, m1, m0, 48 ; 23 34
    call .h
    vpshrdq m3, m2, m0, 48 ; 45 56
.main2:
    call .h
    psrad ym6, ym20, 10
    kmovb k1, k2
    paddd ym17, ym20, ym21 ; my += delta
    vpgatherdq m20{k2}, [r4+ym6*8] ; filter_y0
    psrad ym16, ym17, 10
    kmovb k2, k1
    vpgatherdq m6{k1}, [r4+ym16*8] ; filter_y1
    shufps m5, m20, m6, q2020
    mova m16, m8
    pshufb m4, m5, m12
    vpdpwssd m16, m1, m4 ; a0 b0
    pshufb m5, m13
    mova m1, m2
    vpdpwssd m16, m2, m5 ; a1 b1
    shufps m6, m20, m6, q3131
    paddd ym20, ym17, ym21
    pshufb m4, m6, m12
    mova m2, m3
    vpdpwssd m16, m3, m4 ; a2 b2
    vpshrdq m3, m0, 48 ; 67 78
    pshufb m6, m13
    vpdpwssd m16, m3, m6 ; a3 b3
    ret
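; .h: one horizontal pass over two input rows; the result is left
; interleaved as "a a b b" dwords in m0 (see the vpmultishiftqb at
; the end), ready for the vertical dot products.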
ALIGN function_align
.h:
    movu ym16, [srcq+ssq*1]
    psrad ym6, ym18, 10
    lea srcq, [srcq+ssq*2]
    vinserti32x8 m5, m16, [srcq+ssq*0], 1
    kmovb k1, k2
    paddd ym17, ym18, ym19 ; mx += beta
    vpgatherdq m18{k2}, [r4+ym6*8] ; filter_x1
    psrad ym16, ym17, 10
    kmovb k2, k1
    vpgatherdq m6{k1}, [r4+ym16*8] ; filter_x2
    vpermb m4, m10, m5
    shufps m16, m18, m6, q2020
    shufps m6, m18, m6, q3131
    mova m0, m7
    pshufb m18, m16, m12
    vpdpwssd m0, m4, m18 ; a0 b0
    vpermb m5, m11, m5
    pshufb m18, m6, m13
    vpdpwssd m0, m5, m18 ; a3 b3
    paddd ym18, ym17, ym19
    vshufi32x4 m17, m4, m5, q1021
    pshufb m16, m13
    vpdpwssd m0, m17, m16 ; a1 b1
    vshufi32x4 m4, m5, q2132
    pshufb m6, m12
    vpdpwssd m0, m4, m6 ; a2 b2
    vpmultishiftqb m0, m9, m0 ; a a b b
    ret

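; BIDIR_FN provides the store loops shared by the bidirectional
; compound functions below (avg, w_avg, mask); each caller supplies a
; .main that produces two zmm registers of output pixels per call.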
%macro BIDIR_FN 0
    call .main
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    movq [dstq ], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4 xm2, ym0, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp hd, 8
    jl .w4_end
    vextracti32x4 xm2, m0, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq ], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm0, m0, 3
    movq [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea dstq, [dstq+strideq*4]
    movq [dstq ], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4 xm0, ym1, 1
    movq [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    vextracti32x4 xm0, m1, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq ], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4 xm1, m1, 3
    movq [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w8:
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub hd, 8
    jl .w8_end
    lea dstq, [dstq+strideq*4]
    mova [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w16:
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w32:
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    call .main
    mova [dstq+64*2], m0
    mova [dstq+64*3], m1
    dec hd
    jg .w128_loop
    RET
%endmacro

%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif

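; avg: saturating (tmp1 + tmp2), clamped and de-biased with avg_round,
; then shifted down to pixel range by avg_shift (both bitdepth-dependent).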
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
    lea r6, [avg_avx512icl_table]
    tzcnt wd, wm
    mov t0d, r6m ; pixel_max
    movsxd wq, [r6+wq*4]
    shr t0d, 11
    vpbroadcastd m2, [base+avg_round+t0*4]
    vpbroadcastd m3, [base+avg_shift+t0*4]
    movifnidn hd, hm
    add wq, r6
    BIDIR_FN
ALIGN function_align
.main:
    mova m0, [tmp1q+64*0]
    paddsw m0, [tmp2q+64*0]
    mova m1, [tmp1q+64*1]
    paddsw m1, [tmp2q+64*1]
    add tmp1q, 64*2
    add tmp2q, 64*2
    pmaxsw m0, m2
    pmaxsw m1, m2
    psubsw m0, m2
    psubsw m1, m2
    vpsrlvw m0, m3
    vpsrlvw m1, m3
    ret

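; w_avg: weighted average. m6 holds (16-weight, weight) word pairs, so
; a single vpdpwssd per dword lane computes tmp2*(16-w) + tmp1*w.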
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
    lea r6, [w_avg_avx512icl_table]
    tzcnt wd, wm
    mov t0d, r7m ; pixel_max
    shr t0d, 11
    movsxd wq, [r6+wq*4]
    vpbroadcastd m5, [base+w_avg_round+t0*4]
    vpbroadcastd m7, [base+bidir_shift+t0*4]
    add wq, r6
    mov r6d, r6m ; weight
    lea t0d, [r6-16]
    shl r6d, 16
    sub r6d, t0d ; 16-weight, weight
    movifnidn hd, hm
    vpbroadcastd m6, r6d
    BIDIR_FN
ALIGN function_align
.main:
    mova m3, [tmp1q+64*0]
    mova m1, [tmp2q+64*0]
    mova m0, [tmp1q+64*1]
    mova m4, [tmp2q+64*1]
    add tmp1q, 64*2
    add tmp2q, 64*2
    punpcklwd m2, m1, m3
    punpckhwd m1, m3
    punpcklwd m3, m4, m0
    punpckhwd m4, m0
    mova m0, m5
    vpdpwssd m0, m6, m2
    mova m2, m5
    vpdpwssd m2, m6, m1
    mova m1, m5
    vpdpwssd m1, m6, m3
    mova m3, m5
    vpdpwssd m3, m6, m4
    REPX {psrad x, 2}, m0, m2, m1, m3
    packusdw m0, m2
    packusdw m1, m3
    vpsrlvw m0, m7
    vpsrlvw m1, m7
    ret

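; mask: per-pixel blend with a 6-bit mask read from memory:
; dst = tmp1*m + tmp2*(64-m), rounded and shifted to pixel range.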
cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
    lea r7, [mask_avx512icl_table]
    tzcnt wd, wm
    mov r6d, r7m ; pixel_max
    movifnidn hd, hm
    shr r6d, 11
    movsxd wq, [r7+wq*4]
    vpbroadcastd m8, [base+pw_64]
    vpbroadcastd m9, [base+mask_round+r6*4]
    vpbroadcastd m10, [base+bidir_shift+r6*4]
    mov maskq, maskmp
    add wq, r7
    BIDIR_FN
ALIGN function_align
.main:
    pmovzxbw m1, [maskq+32*0]
    mova m4, [tmp1q+64*0]
    mova m2, [tmp2q+64*0]
    pmovzxbw m6, [maskq+32*1]
    mova m5, [tmp1q+64*1]
    mova m3, [tmp2q+64*1]
    add maskq, 32*2
    add tmp1q, 64*2
    add tmp2q, 64*2
    punpcklwd m7, m4, m2
    punpckhwd m4, m2
    psubw m0, m8, m1
    punpcklwd m2, m1, m0 ; m, 64-m
    punpckhwd m1, m0
    mova m0, m9
    vpdpwssd m0, m7, m2
    mova m2, m9
    vpdpwssd m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
    punpcklwd m7, m5, m3
    punpckhwd m5, m3
    psubw m1, m8, m6
    punpcklwd m3, m6, m1
    punpckhwd m6, m1
    mova m1, m9
    vpdpwssd m1, m7, m3
    mova m3, m9
    vpdpwssd m3, m5, m6
    REPX {psrad x, 4}, m0, m2, m1, m3
    packusdw m0, m2
    packusdw m1, m3
    vpsrlvw m0, m10
    vpsrlvw m1, m10
    ret

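; w_mask_420: as mask, but the 6-bit mask is derived from |tmp1 - tmp2|
; (via the pw_27615 threshold) instead of being read from memory, and a
; 2x2-subsampled copy is written out for 4:2:0 chroma; r7m ('sign')
; selects the rounding used for the stored mask.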
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
    lea r7, [w_mask_420_avx512icl_table]
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    movifnidn hd, hm
    shr r6d, 11
    movsxd wq, [r7+wq*4]
    vpbroadcastd m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd m11, [base+pw_64]
    vpbroadcastd m12, [base+mask_round+r6*4]
    vpbroadcastd m13, [base+bidir_shift+r6*4]
    mov r6d, r7m ; sign
    vpbroadcastd m14, [base+w_mask_round+r6*4]
    mova ym15, [w_mask_end42x]
    mov maskq, maskmp
    add wq, r7
    call .main
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    mova m4, [w_mask_shuf4]
    vpermt2b m2, m4, m3
    mova m3, m14
    vpdpbusd m3, m2, [pb_64] {1to16}
    vpermb m3, m15, m3
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4 xm2, ym0, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    mova [maskq], xm3
    cmp hd, 8
    jl .w4_end
    vextracti32x4 xm2, m0, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm0, m0, 3
    movq [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4 xm2, ym1, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4 xm2, m1, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm1, m1, 3
    movq [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8:
    mova m8, [w_mask_shuf8]
    vpbroadcastd m9, [pb_64]
    jmp .w8_start
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*4]
    add maskq, 16
.w8_start:
    vpermt2b m2, m8, m3
    mova m3, m14
    vpdpbusd m3, m2, m9
    vpermb m3, m15, m3
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    mova [maskq], xm3
    sub hd, 8
    jl .w8_end
    lea dstq, [dstq+strideq*4]
    mova [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16:
    mova m8, [w_mask_shuf16]
    vpbroadcastd m9, [pb_64]
    jmp .w16_start
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*4]
    add maskq, 16
.w16_start:
    vpermt2b m2, m8, m3
    mova m3, m14
    vpdpbusd m3, m2, m9
    vpermb m3, m15, m3
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    mova [maskq], xm3
    sub hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea dstq, [dstq+strideq*4]
    add maskq, 32
.w32:
    paddw m2, m3
    mova m8, m14
    vpdpwssd m8, m11, m2
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    call .main
    paddw m2, m3
    mova m3, m14
    vpdpwssd m3, m11, m2
    vpermt2b m8, m15, m3
    mova [dstq+strideq*2], m0
    mova [dstq+stride3q ], m1
    mova [maskq], ym8
    sub hd, 4
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 32
.w64:
    mova m8, m2
    mova m9, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    call .main
    paddw m8, m2
    paddw m9, m3
    mova m2, m14
    vpdpwssd m2, m11, m8
    mova m3, m14
    vpdpwssd m3, m11, m9
    vpermt2b m2, m15, m3
    mova [dstq+strideq*1+64*0], m0
    mova [dstq+strideq*1+64*1], m1
    mova [maskq], ym2
    sub hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea dstq, [dstq+strideq*2]
    add maskq, 64
.w128:
    mova m16, m2
    mova m8, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    call .main
    mova m17, m2
    mova m9, m3
    mova [dstq+strideq*0+64*2], m0
    mova [dstq+strideq*0+64*3], m1
    call .main
    paddw m2, m16
    paddw m3, m8
    mova m16, m14
    vpdpwssd m16, m11, m2
    mova m8, m14
    vpdpwssd m8, m11, m3
    mova [dstq+strideq*1+64*0], m0
    mova [dstq+strideq*1+64*1], m1
    call .main
    paddw m2, m17
    paddw m3, m9
    mova m17, m14
    vpdpwssd m17, m11, m2
    mova m9, m14
    vpdpwssd m9, m11, m3
    vpermt2b m16, m15, m8
    vpermt2b m17, m15, m9
    mova [dstq+strideq*1+64*2], m0
    mova [dstq+strideq*1+64*3], m1
    mova [maskq+32*0], ym16
    mova [maskq+32*1], ym17
    sub hd, 2
    jg .w128_loop
    vzeroupper
    RET
ALIGN function_align
.main:
    mova m1, [tmp1q+64*0]
    mova m3, [tmp2q+64*0]
    mova m4, [tmp1q+64*1]
    mova m7, [tmp2q+64*1]
    add tmp1q, 64*2
    add tmp2q, 64*2
    psubsw m6, m1, m3
    punpcklwd m5, m3, m1
    pabsw m6, m6
    punpckhwd m3, m1
    psubusw m6, m10, m6
    psrlw m6, 10 ; 64-m
    psubw m2, m11, m6 ; m
    punpcklwd m1, m6, m2
    punpckhwd m6, m2
    mova m0, m12
    vpdpwssd m0, m5, m1
    mova m1, m12
    vpdpwssd m1, m3, m6
    psubsw m5, m4, m7
    punpcklwd m6, m7, m4
    pabsw m5, m5
    punpckhwd m7, m4
    psubusw m5, m10, m5
    psrlw m5, 10
    psubw m3, m11, m5
    punpcklwd m4, m5, m3
    psrad m0, 4
    punpckhwd m5, m3
    psrad m1, 4
    packusdw m0, m1
    mova m1, m12
    vpdpwssd m1, m6, m4
    mova m4, m12
    vpdpwssd m4, m7, m5
    psrad m1, 4
    psrad m4, 4
    packusdw m1, m4
    vpsrlvw m0, m13
    vpsrlvw m1, m13
    ret

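; w_mask_422: same blend and mask derivation as w_mask_420, but the
; stored mask is only subsampled horizontally, so .main writes 32
; mask bytes itself instead of accumulating across row pairs.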
cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
    lea r7, [w_mask_422_avx512icl_table]
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    movifnidn hd, hm
    shr r6d, 11
    movsxd wq, [r7+wq*4]
    vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd m9, [base+pw_64]
    vpbroadcastd m10, [base+mask_round+r6*4]
    vpbroadcastd m11, [base+bidir_shift+r6*4]
    mov r6d, r7m ; sign
    vpbroadcastd m12, [base+w_mask_round+r6*4]
    mova ym13, [w_mask_end42x]
    mov maskq, maskmp
    add wq, r7
    paddw m14, m9, m9 ; pw_128
    call .main
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4 xm2, ym0, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp hd, 8
    jl .w4_end
    vextracti32x4 xm2, m0, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm0, m0, 3
    movq [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4 xm2, ym1, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4 xm2, m1, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm1, m1, 3
    movq [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w8:
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub hd, 8
    jl .w8_end
    lea dstq, [dstq+strideq*4]
    mova [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w16:
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w32:
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    call .main
    mova [dstq+64*2], m0
    mova [dstq+64*3], m1
    dec hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    mova m1, [tmp1q+64*0]
    mova m3, [tmp2q+64*0]
    mova m4, [tmp1q+64*1]
    mova m7, [tmp2q+64*1]
    add tmp1q, 64*2
    add tmp2q, 64*2
    psubsw m6, m1, m3
    punpcklwd m5, m3, m1
    pabsw m6, m6
    punpckhwd m3, m1
    psubusw m6, m8, m6
    psrlw m6, 10
    psubw m2, m9, m6
    punpcklwd m1, m6, m2
    punpckhwd m6, m2
    mova m0, m10
    vpdpwssd m0, m5, m1
    mova m1, m10
    vpdpwssd m1, m3, m6
    psubsw m5, m4, m7
    punpcklwd m6, m7, m4
    pabsw m5, m5
    punpckhwd m7, m4
    psubusw m5, m8, m5
    psrlw m5, 10
    psubw m3, m9, m5
    punpcklwd m4, m5, m3
    psrad m0, 4
    punpckhwd m5, m3
    psrad m1, 4
    packusdw m0, m1
    mova m1, m10
    vpdpwssd m1, m6, m4
    mova m4, m10
    vpdpwssd m4, m7, m5
    mova m5, m12
    vpdpwssd m5, m14, m2
    mova m2, m12
    vpdpwssd m2, m14, m3
    psrad m1, 4
    psrad m4, 4
    packusdw m1, m4
    vpermt2b m5, m13, m2
    vpsrlvw m0, m11
    vpsrlvw m1, m11
    mova [maskq], ym5
    add maskq, 32
    ret

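; w_mask_444: same blend again with a full-resolution mask; .main
; stores all 64 mask bytes per call, no subsampling.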
cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_444_avx512icl_table
    lea r7, [w_mask_444_avx512icl_table]
    tzcnt wd, wm
    mov r6d, r8m ; pixel_max
    movifnidn hd, hm
    shr r6d, 11
    movsxd wq, [r7+wq*4]
    vpbroadcastd m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd m9, [base+pw_64]
    vpbroadcastd m10, [base+mask_round+r6*4]
    mova m11, [w_mask_end444]
    vpbroadcastd m12, [base+bidir_shift+r6*4]
    mov maskq, maskmp
    add wq, r7
    call .main
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4 xm2, ym0, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp hd, 8
    jl .w4_end
    vextracti32x4 xm2, m0, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm0, m0, 3
    movq [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4 xm2, ym1, 1
    movq [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4 xm2, m1, 2
    lea dstq, [dstq+strideq*4]
    movq [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4 xm1, m1, 3
    movq [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w8:
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub hd, 8
    jl .w8_end
    lea dstq, [dstq+strideq*4]
    mova [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea dstq, [dstq+strideq*4]
.w16:
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea dstq, [dstq+strideq*2]
.w32:
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    sub hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add dstq, strideq
.w64:
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    dec hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add dstq, strideq
.w128:
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    call .main
    mova [dstq+64*2], m0
    mova [dstq+64*3], m1
    dec hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    mova m1, [tmp1q+64*0]
    mova m3, [tmp2q+64*0]
    mova m4, [tmp1q+64*1]
    mova m7, [tmp2q+64*1]
    add tmp1q, 64*2
    add tmp2q, 64*2
    psubsw m6, m1, m3
    punpcklwd m5, m3, m1
    pabsw m6, m6
    punpckhwd m3, m1
    psubusw m6, m8, m6
    psrlw m6, 10
    psubw m2, m9, m6
    punpcklwd m1, m6, m2
    punpckhwd m6, m2
    mova m0, m10
    vpdpwssd m0, m5, m1
    mova m1, m10
    vpdpwssd m1, m3, m6
    psubsw m5, m4, m7
    punpcklwd m6, m7, m4
    pabsw m5, m5
    punpckhwd m7, m4
    psubusw m5, m8, m5
    psrlw m5, 10
    psubw m3, m9, m5
    punpcklwd m4, m5, m3
    psrad m0, 4
    punpckhwd m5, m3
    psrad m1, 4
    packusdw m0, m1
    mova m1, m10
    vpdpwssd m1, m6, m4
    mova m4, m10
    vpdpwssd m4, m7, m5
    vpermt2b m2, m11, m3
    psrad m1, 4
    psrad m4, 4
    packusdw m1, m4
    vpsrlvw m0, m12
    vpsrlvw m1, m12
    mova [maskq], m2
    add maskq, 64
    ret

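; blend: dst += ((tmp - dst) * mask) >> 6, implemented as pmulhrsw
; against mask * -512 (pw_m512) so the 64-unit mask maps onto the
; 15-bit fixed-point multiply.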
cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx512icl_table
    lea r6, [blend_avx512icl_table]
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, [r6+wq*4]
    movifnidn maskq, maskmp
    vpbroadcastd m6, [base+pw_m512]
    add wq, r6
    lea r6, [dsq*3]
    jmp wq
.w4:
    pmovzxbw ym19, [maskq]
    movq xm16, [dstq+dsq*0]
    movhps xm16, [dstq+dsq*1]
    vpbroadcastq ym17, [dstq+dsq*2]
    vpbroadcastq ym18, [dstq+r6 ]
    pmullw ym19, ym6
    vpblendd ym16, ym17, 0x30
    vpblendd ym16, ym18, 0xc0
    psubw ym17, ym16, [tmpq]
    add maskq, 16
    add tmpq, 32
    pmulhrsw ym17, ym19
    paddw ym16, ym17
    vextracti128 xm17, ym16, 1
    movq [dstq+dsq*0], xm16
    movhps [dstq+dsq*1], xm16
    movq [dstq+dsq*2], xm17
    movhps [dstq+r6 ], xm17
    lea dstq, [dstq+dsq*4]
    sub hd, 4
    jg .w4
    vzeroupper
    RET
.w8:
    pmovzxbw m2, [maskq]
    mova xm0, [dstq+dsq*0]
    vinserti32x4 ym0, [dstq+dsq*1], 1
    vinserti32x4 m0, [dstq+dsq*2], 2
    vinserti32x4 m0, [dstq+r6 ], 3
    pmullw m2, m6
    psubw m1, m0, [tmpq]
    add maskq, 32
    add tmpq, 64
    pmulhrsw m1, m2
    paddw m0, m1
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    vextracti32x4 [dstq+dsq*2], m0, 2
    vextracti32x4 [dstq+r6 ], m0, 3
    lea dstq, [dstq+dsq*4]
    sub hd, 4
    jg .w8
    RET
.w16:
    pmovzxbw m4, [maskq+32*0]
    pmovzxbw m5, [maskq+32*1]
    mova ym0, [dstq+dsq*0]
    vinserti32x8 m0, [dstq+dsq*1], 1
    mova ym1, [dstq+dsq*2]
    vinserti32x8 m1, [dstq+r6 ], 1
    pmullw m4, m6
    pmullw m5, m6
    psubw m2, m0, [tmpq+64*0]
    psubw m3, m1, [tmpq+64*1]
    add maskq, 32*2
    add tmpq, 64*2
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    mova [dstq+dsq*2], ym1
    vextracti32x8 [dstq+r6 ], m1, 1
    lea dstq, [dstq+dsq*4]
    sub hd, 4
    jg .w16
    RET
.w32:
    pmovzxbw m4, [maskq+32*0]
    pmovzxbw m5, [maskq+32*1]
    mova m0, [dstq+dsq*0]
    mova m1, [dstq+dsq*1]
    pmullw m4, m6
    pmullw m5, m6
    psubw m2, m0, [tmpq+ 64*0]
    psubw m3, m1, [tmpq+ 64*1]
    add maskq, 32*2
    add tmpq, 64*2
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .w32
    RET

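; blend_v: like blend, but with a fixed per-column obmc mask taken
; from obmc_masks_avx2, so no mask argument is needed.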
cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
    lea r5, [blend_v_avx512icl_table]
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, [r5+wq*4]
    add wq, r5
    jmp wq
.w2:
    vpbroadcastd xmm2, [obmc_masks_avx2+2*2]
.w2_loop:
    movd xmm0, [dstq+dsq*0]
    pinsrd xmm0, [dstq+dsq*1], 1
    movq xmm1, [tmpq]
    add tmpq, 4*2
    psubw xmm1, xmm0, xmm1
    pmulhrsw xmm1, xmm2
    paddw xmm0, xmm1
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .w2_loop
    RET
.w4:
    vpbroadcastq xmm2, [obmc_masks_avx2+4*2]
.w4_loop:
    movq xmm0, [dstq+dsq*0]
    movhps xmm0, [dstq+dsq*1]
    psubw xmm1, xmm0, [tmpq]
    add tmpq, 8*2
    pmulhrsw xmm1, xmm2
    paddw xmm0, xmm1
    movq [dstq+dsq*0], xmm0
    movhps [dstq+dsq*1], xmm0
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .w4_loop
    RET
.w8:
    vbroadcasti32x4 ym2, [obmc_masks_avx2+8*2]
.w8_loop:
    mova xm0, [dstq+dsq*0]
    vinserti32x4 ym0, [dstq+dsq*1], 1
    psubw ym1, ym0, [tmpq]
    add tmpq, 16*2
    pmulhrsw ym1, ym2
    paddw ym0, ym1
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .w8_loop
    RET
.w16:
    vbroadcasti32x8 m2, [obmc_masks_avx2+16*2]
.w16_loop:
    mova ym0, [dstq+dsq*0]
    vinserti32x8 m0, [dstq+dsq*1], 1
    psubw m1, m0, [tmpq]
    add tmpq, 32*2
    pmulhrsw m1, m2
    paddw m0, m1
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .w16_loop
    RET
.w32:
    mova m4, [obmc_masks_avx2+32*2]
.w32_loop:
    mova m0, [dstq+dsq*0]
    psubw m2, m0, [tmpq+ 64*0]
    mova m1, [dstq+dsq*1]
    psubw m3, m1, [tmpq+ 64*1]
    add tmpq, 64*2
    pmulhrsw m2, m4
    pmulhrsw m3, m4
    paddw m0, m2
    paddw m1, m3
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    sub hd, 2
    jg .w32_loop
    RET

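; blend_h: per-row obmc mask; only the top 3/4 of the block rows are
; blended (see the h * 3/4 loop counter set up in the prologue).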
cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
%define base r6-$$
    lea r6, [$$]
    tzcnt wd, wm
    mov hd, hm
    movsxd wq, [base+blend_h_avx512icl_table+wq*4]
    lea maskq, [base+obmc_masks_avx2+hq*2]
    lea hd, [hq*3]
    lea wq, [base+blend_h_avx512icl_table+wq]
    shr hd, 2 ; h * 3/4
    lea maskq, [maskq+hq*2]
    neg hq
    jmp wq
.w2:
    movd xmm0, [dstq+dsq*0]
    pinsrd xmm0, [dstq+dsq*1], 1
    movd xmm2, [maskq+hq*2]
    movq xmm1, [tmpq]
    add tmpq, 4*2
    punpcklwd xmm2, xmm2
    psubw xmm1, xmm0, xmm1
    pmulhrsw xmm1, xmm2
    paddw xmm0, xmm1
    movd [dstq+dsq*0], xmm0
    pextrd [dstq+dsq*1], xmm0, 1
    lea dstq, [dstq+dsq*2]
    add hq, 2
    jl .w2
    RET
.w4:
    mova xmm3, [blend_shuf]
.w4_loop:
    movq xmm0, [dstq+dsq*0]
    movhps xmm0, [dstq+dsq*1]
    movd xmm2, [maskq+hq*2]
    psubw xmm1, xmm0, [tmpq]
    add tmpq, 8*2
    pshufb xmm2, xmm3
    pmulhrsw xmm1, xmm2
    paddw xmm0, xmm1
    movq [dstq+dsq*0], xmm0
    movhps [dstq+dsq*1], xmm0
    lea dstq, [dstq+dsq*2]
    add hq, 2
    jl .w4_loop
    RET
.w8:
    vbroadcasti32x4 ym3, [blend_shuf]
    shufpd ym3, ym3, 0x0c
.w8_loop:
    mova xm0, [dstq+dsq*0]
    vinserti32x4 ym0, [dstq+dsq*1], 1
    vpbroadcastd ym2, [maskq+hq*2]
    psubw ym1, ym0, [tmpq]
    add tmpq, 16*2
    pshufb ym2, ym3
    pmulhrsw ym1, ym2
    paddw ym0, ym1
    mova [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea dstq, [dstq+dsq*2]
    add hq, 2
    jl .w8_loop
    RET
.w16:
    vbroadcasti32x4 m3, [blend_shuf]
    shufpd m3, m3, 0xf0
.w16_loop:
    mova ym0, [dstq+dsq*0]
    vinserti32x8 m0, [dstq+dsq*1], 1
    vpbroadcastd m2, [maskq+hq*2]
    psubw m1, m0, [tmpq]
    add tmpq, 32*2
    pshufb m2, m3
    pmulhrsw m1, m2
    paddw m0, m1
    mova [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea dstq, [dstq+dsq*2]
    add hq, 2
    jl .w16_loop
    RET
.w32:
    vpbroadcastw m4, [maskq+hq*2]
    vpbroadcastw m5, [maskq+hq*2+2]
    mova m0, [dstq+dsq*0]
    psubw m2, m0, [tmpq+ 64*0]
    mova m1, [dstq+dsq*1]
    psubw m3, m1, [tmpq+ 64*1]
    add tmpq, 64*2
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw m0, m2
    paddw m1, m3
    mova [dstq+dsq*0], m0
    mova [dstq+dsq*1], m1
    lea dstq, [dstq+dsq*2]
    add hq, 2
    jl .w32
    RET
.w64:
    vpbroadcastw m4, [maskq+hq*2]
    mova m0, [dstq+64*0]
    psubw m2, m0, [tmpq+64*0]
    mova m1, [dstq+64*1]
    psubw m3, m1, [tmpq+64*1]
    add tmpq, 64*2
    pmulhrsw m2, m4
    pmulhrsw m3, m4
    paddw m0, m2
    paddw m1, m3
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    add dstq, dsq
    inc hq
    jl .w64
    RET
.w128:
    vpbroadcastw m8, [maskq+hq*2]
    mova m0, [dstq+64*0]
    psubw m4, m0, [tmpq+64*0]
    mova m1, [dstq+64*1]
    psubw m5, m1, [tmpq+64*1]
    mova m2, [dstq+64*2]
    psubw m6, m2, [tmpq+64*2]
    mova m3, [dstq+64*3]
    psubw m7, m3, [tmpq+64*3]
    add tmpq, 64*4
    REPX {pmulhrsw x, m8}, m4, m5, m6, m7
    paddw m0, m4
    paddw m1, m5
    paddw m2, m6
    paddw m3, m7
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    mova [dstq+64*2], m2
    mova [dstq+64*3], m3
    add dstq, dsq
    inc hq
    jl .w128
    RET

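; resize: horizontal 8-tap scaled resampling. The dx accumulator in m4
; yields per-pixel source positions and filter phases; when a filter
; window would cross the row edge, the slow path after the ktestw
; rebuilds the source pixels with gathers and edge shuffles, otherwise
; .load gathers them directly.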
cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
                                 dst_w, h, src_w, dx, mx0, pxmax
    sub dword mx0m, 4<<14
    sub dword src_wm, 8
    mov r6, ~0
    vpbroadcastd m5, dxm
    vpbroadcastd m8, mx0m
    vpbroadcastd m6, src_wm
    kmovq k6, r6
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    LEA r7, $$
%define base r7-$$
    vpbroadcastd m3, [base+pd_16384]
    vpbroadcastd m7, [base+pd_63]
    mova m24, [base+resize_permA]
    mova m25, [base+resize_permB]
    mova m26, [base+resize_permC]
    mova m27, [base+resize_permD]
    vbroadcasti32x4 m28, [base+resize_shufA]
    vbroadcasti32x4 m29, [base+resize_shufB]
    mova m30, [base+resize_permE]
    vpbroadcastw ym31, pxmaxm
    vpdpwssd m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
    pslld m5, 4 ; dx*16
    pslld m6, 14
    pxor m2, m2
.loop_y:
    xor xd, xd
    mova m4, m8 ; per-line working version of mx
.loop_x:
    pmaxsd m0, m4, m2
    psrad m9, m4, 8 ; filter offset (unmasked)
    pminsd m0, m6 ; iclip(mx, 0, src_w-8)
    psubd m1, m4, m0 ; pshufb offset
    psrad m0, 14 ; clipped src_x offset
    psrad m1, 14 ; pshufb edge_emu offset
    vptestmd k5, m1, m1
    pand m9, m7 ; filter offset (masked)
    ktestw k5, k5
    jz .load
    vpbroadcastq m14, [base+pd_0_4]
    vpermq m10, m0, q1100
    vpermq m11, m0, q3322
    vpermq m20, m1, q1100
    vpermq m21, m1, q3322
    punpckldq m10, m10
    punpckldq m11, m11
    punpckldq m20, m20
    punpckldq m21, m21
    paddd m10, m14
    paddd m11, m14
    paddd m20, m14
    paddd m21, m14
    vextracti32x8 ym12, m10, 1
    vextracti32x8 ym13, m11, 1
    vextracti32x8 ym22, m20, 1
    vextracti32x8 ym23, m21, 1
    kmovq k1, k6
    kmovq k2, k6
    kmovq k3, k6
    kmovq k4, k6
    vpgatherdq m16{k1}, [srcq+ym10*2] ; 0 1 2 3
    vpgatherdq m17{k2}, [srcq+ym11*2] ; 4 5 6 7
    vpgatherdq m18{k3}, [srcq+ym12*2] ; 8 9 A B
    vpgatherdq m19{k4}, [srcq+ym13*2] ; C D E F
    kmovq k1, k6
    kmovq k2, k6
    kmovq k3, k6
    kmovq k4, k6
    vpgatherdq m0{k1}, [base+resize_shuf+8+ym20*2]
    vpgatherdq m1{k2}, [base+resize_shuf+8+ym21*2]
    vpgatherdq m14{k3}, [base+resize_shuf+8+ym22*2]
    vpgatherdq m15{k4}, [base+resize_shuf+8+ym23*2]
    pshufb m16, m0
    pshufb m17, m1
    pshufb m18, m14
    pshufb m19, m15
    mova m20, m24
    mova m22, m24
    mova m21, m25
    mova m23, m25
    vpermi2d m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
    vpermi2d m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
    vpermi2d m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
    vpermi2d m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
    mova m15, m26
    mova m17, m26
    mova m16, m27
    mova m18, m27
    vpermi2q m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
    vpermi2q m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
    vpermi2q m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
    vpermi2q m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
    kmovq k1, k6
    kmovq k2, k6
    vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
    vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
    pshufb m10, m11, m28
    pshufb m11, m11, m29
    pshufb m12, m13, m28
    pshufb m13, m13, m29
    jmp .filter
.load:
    kmovq k1, k6
    kmovq k2, k6
    kmovq k3, k6
    kmovq k4, k6
    vpgatherdd m11{k1}, [base+resize_filter+m9*8+0]
    vpgatherdd m13{k2}, [base+resize_filter+m9*8+4]
    pshufb m10, m11, m28
    pshufb m11, m11, m29
    pshufb m12, m13, m28
    pshufb m13, m13, m29
    vpgatherdd m15{k3}, [srcq+m0*2+ 0]
    vpgatherdd m16{k4}, [srcq+m0*2+ 4]
    kmovq k1, k6
    kmovq k2, k6
    vpgatherdd m17{k1}, [srcq+m0*2+ 8]
    vpgatherdd m18{k2}, [srcq+m0*2+12]
.filter:
    mova m14, m2
    vpdpwssd m14, m15, m10
    vpdpwssd m14, m16, m11
    vpdpwssd m14, m17, m12
    vpdpwssd m14, m18, m13
    psubd m14, m3, m14
    psrad m14, 15
    packusdw m14, m14
    vpermq m14, m30, m14
    pminsw ym14, ym31
    mova [dstq+xq*2], ym14
    paddd m4, m5
    add xd, 16
    cmp xd, dst_wd
    jl .loop_x
    add dstq, dst_strideq
    add srcq, src_strideq
    dec hd
    jg .loop_y
    RET

%endif ; ARCH_X86_64