@@ -710,6 +710,51 @@ endfunc
st1 {v2.8h}, [\src], \inc
.endm
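+// Shift the transform output in \coef0-\coef7 right by 6 with rounding,
+// add it to eight rows of destination pixels, saturate to unsigned 8 bit
+// and store the result back. x0 and x3 point to the even and odd
+// destination rows, with x1 holding twice the destination stride.
+// \tmp1 and \tmp2 hold the last two loaded rows; the callers pass the
+// .8b views of \coef0/\coef1, which are free by the time they are needed.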
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
+ srshr \coef0, \coef0, #6
+ ld1 {v2.8b}, [x0], x1
+ srshr \coef1, \coef1, #6
+ ld1 {v3.8b}, [x3], x1
+ srshr \coef2, \coef2, #6
+ ld1 {v4.8b}, [x0], x1
+ srshr \coef3, \coef3, #6
+ uaddw \coef0, \coef0, v2.8b
+ ld1 {v5.8b}, [x3], x1
+ uaddw \coef1, \coef1, v3.8b
+ srshr \coef4, \coef4, #6
+ ld1 {v6.8b}, [x0], x1
+ srshr \coef5, \coef5, #6
+ ld1 {v7.8b}, [x3], x1
+ sqxtun v2.8b, \coef0
+ srshr \coef6, \coef6, #6
+ sqxtun v3.8b, \coef1
+ srshr \coef7, \coef7, #6
+ uaddw \coef2, \coef2, v4.8b
+ ld1 {\tmp1}, [x0], x1
+ uaddw \coef3, \coef3, v5.8b
+ ld1 {\tmp2}, [x3], x1
+ sqxtun v4.8b, \coef2
+ sub x0, x0, x1, lsl #2
+ sub x3, x3, x1, lsl #2
+ sqxtun v5.8b, \coef3
+ uaddw \coef4, \coef4, v6.8b
+ st1 {v2.8b}, [x0], x1
+ uaddw \coef5, \coef5, v7.8b
+ st1 {v3.8b}, [x3], x1
+ sqxtun v6.8b, \coef4
+ st1 {v4.8b}, [x0], x1
+ sqxtun v7.8b, \coef5
+ st1 {v5.8b}, [x3], x1
+ uaddw \coef6, \coef6, \tmp1
+ st1 {v6.8b}, [x0], x1
+ uaddw \coef7, \coef7, \tmp2
+ st1 {v7.8b}, [x3], x1
+ sqxtun \tmp1, \coef6
+ sqxtun \tmp2, \coef7
+ st1 {\tmp1}, [x0], x1
+ st1 {\tmp2}, [x3], x1
+.endm
+
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
@@ -728,37 +773,12 @@ function \txfm\()16_1d_8x16_pass1_neon
mov x9, #32
movi v2.8h, #0
-
-.ifc \txfm,idct
- cmp w3, #10
- b.le 3f
- cmp w3, #38
- b.le 4f
-.endif
-
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load_clear \i, x2, x9
.endr
bl \txfm\()16
-.ifc \txfm,idct
- b 5f
-3:
-.irp i, 16, 17, 18, 19
- load_clear \i, x2, x9
-.endr
- bl idct16_quarter
- b 5f
-
-4:
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- load_clear \i, x2, x9
-.endr
- bl idct16_half
-.endif
-
-5:
// Do two 8x8 transposes. Originally, v16-v31 contain the
// 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
// transposed 8x8 blocks.
@@ -812,92 +832,25 @@ endfunc
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
-// w3 = eob
-// x13 = slice offset
+// x3 = slice offset
function \txfm\()16_1d_8x16_pass2_neon
mov x14, x30
mov x9, #32
-.ifc \txfm,idct
- cmp w3, #10
- b.le 3f
- cmp w3, #38
- b.le 4f
-.endif
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load \i, x2, x9
.endr
- cbz x13, 1f
+ cbz x3, 1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
load \i, x2, x9
.endr
1:
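+ // The slice offset in x3 is no longer needed; reuse x3 as a pointer
+ // to the odd output rows and double the stride, as expected by
+ // load_add_store.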
+ add x3, x0, x1
+ lsl x1, x1, #1
bl \txfm\()16
-.ifc \txfm,idct
- b 5f
-3:
-.irp i, 16, 17, 18, 19
- load \i, x2, x9
-.endr
- bl idct16_quarter
- b 5f
-4:
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- load \i, x2, x9
-.endr
- bl idct16_half
-.endif
-5:
- add x8, x0, x1
- lsl x1, x1, #1
-.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
- srshr \coef0, \coef0, #6
- ld1 {v2.8b}, [x0], x1
- srshr \coef1, \coef1, #6
- ld1 {v3.8b}, [x8], x1
- srshr \coef2, \coef2, #6
- ld1 {v4.8b}, [x0], x1
- srshr \coef3, \coef3, #6
- uaddw \coef0, \coef0, v2.8b
- ld1 {v5.8b}, [x8], x1
- uaddw \coef1, \coef1, v3.8b
- srshr \coef4, \coef4, #6
- ld1 {v6.8b}, [x0], x1
- srshr \coef5, \coef5, #6
- ld1 {v7.8b}, [x8], x1
- sqxtun v2.8b, \coef0
- srshr \coef6, \coef6, #6
- sqxtun v3.8b, \coef1
- srshr \coef7, \coef7, #6
- uaddw \coef2, \coef2, v4.8b
- ld1 {\tmp1}, [x0], x1
- uaddw \coef3, \coef3, v5.8b
- ld1 {\tmp2}, [x8], x1
- sqxtun v4.8b, \coef2
- sub x0, x0, x1, lsl #2
- sub x8, x8, x1, lsl #2
- sqxtun v5.8b, \coef3
- uaddw \coef4, \coef4, v6.8b
- st1 {v2.8b}, [x0], x1
- uaddw \coef5, \coef5, v7.8b
- st1 {v3.8b}, [x8], x1
- sqxtun v6.8b, \coef4
- st1 {v4.8b}, [x0], x1
- sqxtun v7.8b, \coef5
- st1 {v5.8b}, [x8], x1
- uaddw \coef6, \coef6, \tmp1
- st1 {v6.8b}, [x0], x1
- uaddw \coef7, \coef7, \tmp2
- st1 {v7.8b}, [x8], x1
- sqxtun \tmp1, \coef6
- sqxtun \tmp2, \coef7
- st1 {\tmp1}, [x0], x1
- st1 {\tmp2}, [x8], x1
-.endm
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
-.purgem load_add_store
br x14
endfunc
@@ -916,6 +869,10 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
cmp w3, #1
b.eq idct16x16_dc_add_neon
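+ // An eob of at most 10 means that, with the VP9 scan order, all
+ // nonzero coefficients are within the top left 4x4 block; an eob of
+ // at most 38 keeps them within the top left 8x8 block.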
+ cmp w3, #10
+ b.le idct16x16_quarter_add_neon
+ cmp w3, #38
+ b.le idct16x16_half_add_neon
.endif
mov x15, x30
// iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
@@ -936,7 +893,6 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifnc \txfm1\()_\txfm2,idct_idct
movrel x11, iadst16_coeffs
mov x7, #0
- mov w3, #256
.else
movrel x12, min_eob_idct_idct_16
.endif
@@ -960,7 +916,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
add x0, x4, #(\i)
mov x1, x5
add x2, sp, #(\i*2)
- mov x13, #\i
+ mov x3, #\i
bl \txfm2\()16_1d_8x16_pass2_neon
.endr
@@ -980,6 +936,163 @@ itxfm_func16x16 iadst, idct
itxfm_func16x16 idct, iadst
itxfm_func16x16 iadst, iadst
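+
+// Simplified versions of the 8x16 pass1/pass2 functions above, for the
+// cases where only the top left 4x4 (quarter) or 8x8 (half) block of
+// coefficients is nonzero. They take the same arguments, but only load
+// the rows that can be nonzero.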
+function idct16_1d_8x16_pass1_quarter_neon
+ mov x14, x30
+ mov x9, #32
+ movi v2.8h, #0
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+
+ bl idct16_quarter
+
+ // Do two 8x8 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+ // transposed 8x8 blocks.
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ // Store the transposed 8x8 blocks horizontally.
+ // The first 8x8 block is kept in registers for the second pass,
+ // store the rest in the temp buffer.
+ // Since only a 4x4 part of the input was nonzero,
+ // this means that only 4 rows are nonzero after transposing, and
+ // the second pass only reads the topmost 4 rows. Therefore only
+ // store the topmost 4 rows.
+.irp i, 24, 25, 26, 27
+ add x0, x0, #16
+ store \i, x0, #16
+.endr
+ br x14
+endfunc
+
+function idct16_1d_8x16_pass2_quarter_neon
+ mov x14, x30
+ mov x9, #32
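+ // For the first slice (x3 == 0), the input is still in v16-v19 from
+ // pass 1 and does not need to be reloaded from the temp buffer.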
+ cbz x3, 1f
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl idct16_quarter
+
+ load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+ load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+ br x14
+endfunc
+
+function idct16_1d_8x16_pass1_half_neon
+ mov x14, x30
+ mov x9, #32
+ movi v2.8h, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+
+ bl idct16_half
+
+ // Do two 8x8 transposes. Originally, v16-v31 contain the
+ // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+ // transposed 8x8 blocks.
+ transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+ transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+ // Store the transposed 8x8 blocks horizontally.
+ // The first 8x8 block is kept in registers for the second pass,
+ // store the rest in the temp buffer.
+ // Since an 8x8 part of the input was nonzero, all 8 rows can be
+ // nonzero after transposing, and the second pass reads all of
+ // them, so store all 8 rows of the second block.
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+ add x0, x0, #16
+ store \i, x0, #16
+.endr
+ br x14
+endfunc
+
+function idct16_1d_8x16_pass2_half_neon
+ mov x14, x30
+ mov x9, #32
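+ // For the first slice (x3 == 0), the input is still in v16-v23 from
+ // pass 1 and does not need to be reloaded from the temp buffer.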
+ cbz x3, 1f
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+1:
+
+ add x3, x0, x1
+ lsl x1, x1, #1
+ bl idct16_half
+
+ load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+ load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+ br x14
+endfunc
+
+function idct16x16_quarter_add_neon
+ mov x15, x30
+
+ sub sp, sp, #512
+
+ mov x4, x0
+ mov x5, x1
+ mov x6, x2
+
+ movrel x10, idct_coeffs
+ ld1 {v0.8h,v1.8h}, [x10]
+
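+ // Only the top left 4x4 block of coefficients is nonzero, so only
+ // the first 8 pixel wide slice needs the first pass.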
+.irp i, 0
+ add x0, sp, #(\i*32)
+ mov x1, #\i
+ add x2, x6, #(\i*2)
+ bl idct16_1d_8x16_pass1_quarter_neon
+.endr
+.irp i, 0, 8
+ add x0, x4, #(\i)
+ mov x1, x5
+ add x2, sp, #(\i*2)
+ mov x3, #\i
+ bl idct16_1d_8x16_pass2_quarter_neon
+.endr
+
+ add sp, sp, #512
+ br x15
+endfunc
+
+function idct16x16_half_add_neon
+ mov x15, x30
+
+ sub sp, sp, #512
+
+ mov x4, x0
+ mov x5, x1
+ mov x6, x2
+
+ movrel x10, idct_coeffs
+ ld1 {v0.8h,v1.8h}, [x10]
+
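+ // Only the top left 8x8 block of coefficients is nonzero, so only
+ // the first 8 pixel wide slice needs the first pass.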
+.irp i, 0
+ add x0, sp, #(\i*32)
+ mov x1, #\i
+ add x2, x6, #(\i*2)
+ bl idct16_1d_8x16_pass1_half_neon
+.endr
+.irp i, 0, 8
+ add x0, x4, #(\i)
+ mov x1, x5
+ add x2, sp, #(\i*2)
+ mov x3, #\i
+ bl idct16_1d_8x16_pass2_half_neon
+.endr
+
+ add sp, sp, #512
+ br x15
+endfunc
function idct32x32_dc_add_neon
movrel x4, idct_coeffs
@@ -1160,6 +1273,85 @@ function idct32_odd_quarter
endfunc
+// Store the registers a, b horizontally, followed by the
+// same registers b, a mirrored.
+.macro store_rev1 a, b
+ // There's no rev128 instruction, but we reverse each 64 bit
+ // half, and then flip them using an ext with 8 bytes offset.
+ rev64 v1.8h, v\b\().8h
+ st1 {v\a\().8h}, [x0], #16
+ rev64 v0.8h, v\a\().8h
+ ext v1.16b, v1.16b, v1.16b, #8
+ st1 {v\b\().8h}, [x0], #16
+ ext v0.16b, v0.16b, v0.16b, #8
+ st1 {v1.8h}, [x0], #16
+ st1 {v0.8h}, [x0], #16
+.endm
+
+// Store the registers a, b horizontally, adding them into the existing
+// output, followed by the same registers b, a mirrored and subtracted
+// from the output.
+.macro store_rev2 a, b
+ ld1 {v4.8h}, [x0]
+ rev64 v1.8h, v\b\().8h
+ add v4.8h, v4.8h, v\a\().8h
+ rev64 v0.8h, v\a\().8h
+ st1 {v4.8h}, [x0], #16
+ ext v1.16b, v1.16b, v1.16b, #8
+ ld1 {v5.8h}, [x0]
+ ext v0.16b, v0.16b, v0.16b, #8
+ add v5.8h, v5.8h, v\b\().8h
+ st1 {v5.8h}, [x0], #16
+ ld1 {v6.8h}, [x0]
+ sub v6.8h, v6.8h, v1.8h
+ st1 {v6.8h}, [x0], #16
+ ld1 {v7.8h}, [x0]
+ sub v7.8h, v7.8h, v0.8h
+ st1 {v7.8h}, [x0], #16
+.endm
+
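+// Load four rows of the even half of the output from the temp buffer
+// (x2, stride x9), add (or subtract, if \neg is set) the odd half held
+// in registers a-d, shift right by 6 with rounding, add four rows of
+// destination pixels (x0, stride x1), saturate and store back to x0.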
+.macro load_acc_store a, b, c, d, neg=0
+ ld1 {v4.8h}, [x2], x9
+ ld1 {v5.8h}, [x2], x9
+.if \neg == 0
+ add v4.8h, v4.8h, v\a\().8h
+ ld1 {v6.8h}, [x2], x9
+ add v5.8h, v5.8h, v\b\().8h
+ ld1 {v7.8h}, [x2], x9
+ add v6.8h, v6.8h, v\c\().8h
+ add v7.8h, v7.8h, v\d\().8h
+.else
+ sub v4.8h, v4.8h, v\a\().8h
+ ld1 {v6.8h}, [x2], x9
+ sub v5.8h, v5.8h, v\b\().8h
+ ld1 {v7.8h}, [x2], x9
+ sub v6.8h, v6.8h, v\c\().8h
+ sub v7.8h, v7.8h, v\d\().8h
+.endif
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ srshr v4.8h, v4.8h, #6
+ ld1 {v2.8b}, [x0], x1
+ srshr v5.8h, v5.8h, #6
+ uaddw v4.8h, v4.8h, v0.8b
+ ld1 {v3.8b}, [x0], x1
+ srshr v6.8h, v6.8h, #6
+ uaddw v5.8h, v5.8h, v1.8b
+ srshr v7.8h, v7.8h, #6
+ sub x0, x0, x1, lsl #2
+ uaddw v6.8h, v6.8h, v2.8b
+ sqxtun v4.8b, v4.8h
+ uaddw v7.8h, v7.8h, v3.8b
+ sqxtun v5.8b, v5.8h
+ st1 {v4.8b}, [x0], x1
+ sqxtun v6.8b, v6.8h
+ st1 {v5.8b}, [x0], x1
+ sqxtun v7.8b, v7.8h
+ st1 {v6.8b}, [x0], x1
+ st1 {v7.8b}, [x0], x1
+.endm
+
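+// Instantiate the 8x32 pass1/pass2 functions. \suffix is either empty
+// (full 32-point IDCT), _quarter (only the top left 8x8 block of
+// coefficients is nonzero) or _half (only the top left 16x16 block is
+// nonzero); the reduced variants only load the rows that can be nonzero
+// and call the matching idct16/idct32_odd variants.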
+.macro idct32_funcs suffix
// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
@@ -1171,149 +1363,102 @@ endfunc
// w3 = eob
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
-function idct32_1d_8x32_pass1_neon
+function idct32_1d_8x32_pass1\suffix\()_neon
// Check if this whole input slice is zero
+.ifb \suffix
cmp w3, w1
b.le 1f
+.endif
mov x14, x30
ld1 {v0.8h,v1.8h}, [x10]
// Double stride of the input, since we only read every other line
mov x9, #128
- movi v4.8h, #0
-
- cmp w3, #4
- b.le 3f
- cmp w3, #135
- b.le 4f
+ movi v2.8h, #0
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
-
- bl idct16
- sub x2, x2, x9, lsl #4
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
- bl idct16_quarter
- sub x2, x2, x9, lsl #2
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
- bl idct16_half
- sub x2, x2, x9, lsl #3
+.endif
+
+ bl idct16\suffix
-5:
// Do two 8x8 transposes. Originally, v16-v31 contain the
// 16 rows. Afterwards, v16-v23 and v24-v31 contain the
// two transposed 8x8 blocks.
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
- // Store the registers a, b horizontally, followed by the
- // same registers b, a mirrored.
-.macro store_rev a, b
- // There's no rev128 instruction, but we reverse each 64 bit
- // half, and then flip them using an ext with 8 bytes offset.
- rev64 v1.8h, v\b\().8h
- st1 {v\a\().8h}, [x0], #16
- rev64 v0.8h, v\a\().8h
- ext v1.16b, v1.16b, v1.16b, #8
- st1 {v\b\().8h}, [x0], #16
- ext v0.16b, v0.16b, v0.16b, #8
- st1 {v1.8h}, [x0], #16
- st1 {v0.8h}, [x0], #16
-.endm
- store_rev 16, 24
- store_rev 17, 25
- store_rev 18, 26
- store_rev 19, 27
- store_rev 20, 28
- store_rev 21, 29
- store_rev 22, 30
- store_rev 23, 31
+ store_rev1 16, 24
+ store_rev1 17, 25
+ store_rev1 18, 26
+ store_rev1 19, 27
+ store_rev1 20, 28
+ store_rev1 21, 29
+ store_rev1 22, 30
+ store_rev1 23, 31
sub x0, x0, #512
-.purgem store_rev
- // Move x2 to the first odd row
+ // Move x2 back to the start of the input, and move
+ // to the first odd row
+.ifb \suffix
+ sub x2, x2, x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+ sub x2, x2, x9, lsl #2
+.endif
+.ifc \suffix,_half
+ sub x2, x2, x9, lsl #3
+.endif
add x2, x2, #64
- movi v4.8h, #0
-
- cmp w3, #34
- b.le 3f
- cmp w3, #135
- b.le 4f
-
+ movi v2.8h, #0
// v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
-
- bl idct32_odd
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
- bl idct32_odd_quarter
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x2]
- st1 {v4.8h}, [x2], x9
+ load_clear \i, x2, x9
.endr
- bl idct32_odd_half
+.endif
+
+ bl idct32_odd\suffix
-5:
transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
- // Store the registers a, b horizontally,
- // adding into the output first, and the mirrored,
- // subtracted from the output.
-.macro store_rev a, b
- ld1 {v4.8h}, [x0]
- rev64 v1.8h, v\b\().8h
- add v4.8h, v4.8h, v\a\().8h
- rev64 v0.8h, v\a\().8h
- st1 {v4.8h}, [x0], #16
- ext v1.16b, v1.16b, v1.16b, #8
- ld1 {v5.8h}, [x0]
- ext v0.16b, v0.16b, v0.16b, #8
- add v5.8h, v5.8h, v\b\().8h
- st1 {v5.8h}, [x0], #16
- ld1 {v6.8h}, [x0]
- sub v6.8h, v6.8h, v1.8h
- st1 {v6.8h}, [x0], #16
- ld1 {v7.8h}, [x0]
- sub v7.8h, v7.8h, v0.8h
- st1 {v7.8h}, [x0], #16
-.endm
-
- store_rev 31, 23
- store_rev 30, 22
- store_rev 29, 21
- store_rev 28, 20
- store_rev 27, 19
- store_rev 26, 18
- store_rev 25, 17
- store_rev 24, 16
-.purgem store_rev
+ store_rev2 31, 23
+ store_rev2 30, 22
+ store_rev2 29, 21
+ store_rev2 28, 20
+ store_rev2 27, 19
+ store_rev2 26, 18
+ store_rev2 25, 17
+ store_rev2 24, 16
br x14
+.ifb \suffix
1:
// Write zeros to the temp buffer for pass 2
movi v16.8h, #0
@@ -1324,6 +1469,7 @@ function idct32_1d_8x32_pass1_neon
st1 {v16.8h-v19.8h}, [x0], #64
.endr
ret
+.endif
endfunc
// This is mostly the same as 8x32_pass1, but without the transpose,
@@ -1334,116 +1480,63 @@ endfunc
// x2 = src (temp buffer)
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
-function idct32_1d_8x32_pass2_neon
+function idct32_1d_8x32_pass2\suffix\()_neon
mov x14, x30
ld1 {v0.8h,v1.8h}, [x10]
mov x9, #128
-
- cmp w3, #34
- b.le 3f
- cmp w3, #135
- b.le 4f
-
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
-
- bl idct16
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #2
- bl idct16_quarter
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #3
- bl idct16_half
+.endif
+
+ bl idct16\suffix
-5:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- st1 {v\i\().8h}, [x2], x9
+ store \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
add x2, x2, #64
- cmp w3, #34
- b.le 3f
- cmp w3, #135
- b.le 4f
-
// v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #4
-
- bl idct32_odd
- b 5f
-3:
+.endif
+.ifc \suffix,_quarter
.irp i, 16, 17, 18, 19
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #2
- bl idct32_odd_quarter
- b 5f
-4:
+.endif
+.ifc \suffix,_half
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x2], x9
+ load \i, x2, x9
.endr
sub x2, x2, x9, lsl #3
- bl idct32_odd_half
-
-5:
- sub x2, x2, #64
-.macro load_acc_store a, b, c, d, neg=0
- ld1 {v4.8h}, [x2], x9
- ld1 {v5.8h}, [x2], x9
-.if \neg == 0
- add v4.8h, v4.8h, v\a\().8h
- ld1 {v6.8h}, [x2], x9
- add v5.8h, v5.8h, v\b\().8h
- ld1 {v7.8h}, [x2], x9
- add v6.8h, v6.8h, v\c\().8h
- add v7.8h, v7.8h, v\d\().8h
-.else
- sub v4.8h, v4.8h, v\a\().8h
- ld1 {v6.8h}, [x2], x9
- sub v5.8h, v5.8h, v\b\().8h
- ld1 {v7.8h}, [x2], x9
- sub v6.8h, v6.8h, v\c\().8h
- sub v7.8h, v7.8h, v\d\().8h
.endif
- ld1 {v0.8b}, [x0], x1
- ld1 {v1.8b}, [x0], x1
- srshr v4.8h, v4.8h, #6
- ld1 {v2.8b}, [x0], x1
- srshr v5.8h, v5.8h, #6
- uaddw v4.8h, v4.8h, v0.8b
- ld1 {v3.8b}, [x0], x1
- srshr v6.8h, v6.8h, #6
- uaddw v5.8h, v5.8h, v1.8b
- srshr v7.8h, v7.8h, #6
- sub x0, x0, x1, lsl #2
- uaddw v6.8h, v6.8h, v2.8b
- sqxtun v4.8b, v4.8h
- uaddw v7.8h, v7.8h, v3.8b
- sqxtun v5.8b, v5.8h
- st1 {v4.8b}, [x0], x1
- sqxtun v6.8b, v6.8h
- st1 {v5.8b}, [x0], x1
- sqxtun v7.8b, v7.8h
- st1 {v6.8b}, [x0], x1
- st1 {v7.8b}, [x0], x1
-.endm
+ sub x2, x2, #64
+
+ bl idct32_odd\suffix
+
load_acc_store 31, 30, 29, 28
load_acc_store 27, 26, 25, 24
load_acc_store 23, 22, 21, 20
@@ -1454,9 +1547,13 @@ function idct32_1d_8x32_pass2_neon
load_acc_store 20, 21, 22, 23, 1
load_acc_store 24, 25, 26, 27, 1
load_acc_store 28, 29, 30, 31, 1
-.purgem load_acc_store
br x14
endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
const min_eob_idct_idct_32, align=4
.short 0, 34, 135, 336
@@ -1465,6 +1562,10 @@ endconst
function ff_vp9_idct_idct_32x32_add_neon, export=1
cmp w3, #1
b.eq idct32x32_dc_add_neon
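+ // An eob of at most 34 means that, with the VP9 scan order, all
+ // nonzero coefficients are within the top left 8x8 block; an eob of
+ // at most 135 keeps them within the top left 16x16 block.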
+ cmp w3, #34
+ b.le idct32x32_quarter_add_neon
+ cmp w3, #135
+ b.le idct32x32_half_add_neon
movrel x10, idct_coeffs
add x11, x10, #32
@@ -1505,3 +1606,81 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
br x15
endfunc
+
+function idct32x32_quarter_add_neon
+ movrel x10, idct_coeffs
+ add x11, x10, #32
+
+ mov x15, x30
+
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+
+ sub sp, sp, #2048
+
+ mov x4, x0
+ mov x5, x1
+ mov x6, x2
+
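+ // Only the top left 8x8 block of coefficients is nonzero, so only
+ // the first 8 pixel wide slice needs the first pass.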
+.irp i, 0
+ add x0, sp, #(\i*64)
+ add x2, x6, #(\i*2)
+ bl idct32_1d_8x32_pass1_quarter_neon
+.endr
+.irp i, 0, 8, 16, 24
+ add x0, x4, #(\i)
+ mov x1, x5
+ add x2, sp, #(\i*2)
+ bl idct32_1d_8x32_pass2_quarter_neon
+.endr
+
+ add sp, sp, #2048
+
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+
+ br x15
+endfunc
+
+function idct32x32_half_add_neon
+ movrel x10, idct_coeffs
+ add x11, x10, #32
+
+ mov x15, x30
+
+ stp d14, d15, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+
+ sub sp, sp, #2048
+
+ mov x4, x0
+ mov x5, x1
+ mov x6, x2
+
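+ // Only the top left 16x16 block of coefficients is nonzero, so only
+ // the first two 8 pixel wide slices need the first pass.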
+.irp i, 0, 8
+ add x0, sp, #(\i*64)
+ add x2, x6, #(\i*2)
+ bl idct32_1d_8x32_pass1_half_neon
+.endr
+.irp i, 0, 8, 16, 24
+ add x0, x4, #(\i)
+ mov x1, x5
+ add x2, sp, #(\i*2)
+ bl idct32_1d_8x32_pass2_half_neon
+.endr
+
+ add sp, sp, #2048
+
+ ldp d8, d9, [sp], 0x10
+ ldp d10, d11, [sp], 0x10
+ ldp d12, d13, [sp], 0x10
+ ldp d14, d15, [sp], 0x10
+
+ br x15
+endfunc