@@ -588,6 +588,9 @@ endfunc
.macro store i, dst, inc
st1 {v\i\().8h}, [\dst], \inc
.endm
+.macro movi_v i, size, imm
+ movi v\i\()\size, \imm // emits: movi v<i><size>, <imm>  (register number and arrangement suffix are pasted together)
+.endm
.macro load_clear i, src, inc
ld1 {v\i\().8h}, [\src]
st1 {v2.8h}, [\src], \inc
@@ -596,11 +599,18 @@ endfunc
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
// transpose into a horizontal 16x8 slice and store.
// x0 = dst (temp buffer)
-// x1 = unused
+// x1 = slice offset
// x2 = src
-// x3 = slice offset
+// w3 = eob
+// w7 = min eob (only checked by the idct variant: the slice is skipped if eob <= min eob)
.macro itxfm16_1d_funcs txfm
function \txfm\()16_1d_8x16_pass1_neon
+.ifc \txfm,idct
+ // Check if this whole input slice is zero (eob <= min eob); if so, branch to the zero-fill path at 2:
+ cmp w3, w7 // w3 = eob, w7 = min eob for this slice
+ b.le 2f
+.endif
+
mov x9, #32
movi v2.8h, #0
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -616,14 +626,14 @@ function \txfm\()16_1d_8x16_pass1_neon
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
// Store the transposed 8x8 blocks horizontally.
- cmp x3, #8
+ cmp x1, #8
b.eq 1f
.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
store \i, x0, #16
.endr
ret
1:
- // Special case: For the last input column (x3 == 8),
+ // Special case: For the last input column (x1 == 8),
// which would be stored as the last row in the temp buffer,
// don't store the first 8x8 block, but keep it in registers
// for the first slice of the second pass (where it is the
@@ -641,6 +651,20 @@ function \txfm\()16_1d_8x16_pass1_neon
mov v30.16b, v22.16b
mov v31.16b, v23.16b
ret
+
+.ifc \txfm,idct
+2:
+ // Zero-slice path: set v24-v31 to zero, for the in-register passthrough
+ // of coefficients to pass 2. Since we only do two slices and the first
+ // is called with min eob 0, this is only expected for the second slice.
+ // So per iteration we skip 16 bytes of the temp buffer and zero the next 16.
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+ add x0, x0, #16 // skip 16 bytes that this path does not write
+ movi_v \i, .16b, #0
+ store 24, x0, #16 // v24 is zero from the first iteration on; writes 16 zero bytes
+.endr
+ ret
+.endif
endfunc
// Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
@@ -719,6 +743,11 @@ endfunc
itxfm16_1d_funcs idct
itxfm16_1d_funcs iadst
+// Minimum eob for each pass 1 input slice (slice offsets 0 and 8); a slice is skipped when eob <= its entry
+const min_eob_idct_idct_16, align=4
+ .short 0, 38 // 16-bit entries, read with ldrh at byte offset (slice offset / 4)
+endconst
+
.macro itxfm_func16x16 txfm1, txfm2
function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifc \txfm1\()_\txfm2,idct_idct
@@ -743,6 +772,9 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
movrel x10, idct_coeffs
.ifnc \txfm1\()_\txfm2,idct_idct
movrel x11, iadst16_coeffs
+ mov x7, #0
+.else
+ movrel x12, min_eob_idct_idct_16
.endif
.ifc \txfm1,idct
ld1 {v0.8h,v1.8h}, [x10]
@@ -750,8 +782,11 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.irp i, 0, 8
add x0, sp, #(\i*32)
+ mov x1, #\i
add x2, x6, #(\i*2)
- mov x3, #\i
+.ifc \txfm1\()_\txfm2,idct_idct
+ ldrh w7, [x12, #(\i/4)]
+.endif
bl \txfm1\()16_1d_8x16_pass1_neon
.endr
.ifc \txfm1\()_\txfm2,iadst_idct
@@ -880,11 +915,16 @@ endfunc
// each output written twice), followed by a separate 16-point IDCT
// of the odd inputs, added/subtracted onto the outputs of the first idct16.
// x0 = dst (temp buffer)
-// x1 = unused
+// w1 = min eob (the slice is skipped as all zero if eob <= min eob)
// x2 = src
+// w3 = eob
// x10 = idct_coeffs
// x11 = idct_coeffs + 32
function idct32_1d_8x32_pass1_neon
+ // Check if this whole input slice is zero (eob <= min eob); if so, branch to the zero-fill path at 1:
+ cmp w3, w1 // w3 = eob, w1 = min eob for this slice
+ b.le 1f
+
ld1 {v0.8h,v1.8h}, [x10]
// Double stride of the input, since we only read every other line
@@ -979,6 +1019,17 @@ function idct32_1d_8x32_pass1_neon
store_rev 24, 16
.purgem store_rev
ret
+
+1:
+ // Zero-slice path: write zeros to the temp buffer for pass 2 instead of running the transform
+ movi v16.8h, #0
+ movi v17.8h, #0
+ movi v18.8h, #0
+ movi v19.8h, #0
+.rept 8
+ st1 {v16.8h-v19.8h}, [x0], #64 // 4 x 16 bytes per store; 8 stores = 512 bytes of zeros from x0
+.endr
+ ret
endfunc
// This is mostly the same as 8x32_pass1, but without the transpose,
@@ -1071,12 +1122,17 @@ function idct32_1d_8x32_pass2_neon
ret
endfunc
+const min_eob_idct_idct_32, align=4
+ .short 0, 34, 135, 336 // min eob for pass 1 slice offsets 0, 8, 16, 24; a slice is skipped when eob <= its entry
+endconst
+
function ff_vp9_idct_idct_32x32_add_neon, export=1
cmp w3, #1
b.eq idct32x32_dc_add_neon
movrel x10, idct_coeffs
add x11, x10, #32
+ movrel x12, min_eob_idct_idct_32
mov x15, x30
@@ -1093,6 +1149,7 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
.irp i, 0, 8, 16, 24
add x0, sp, #(\i*64)
+ ldrh w1, [x12, #(\i/4)]
add x2, x6, #(\i*2)
bl idct32_1d_8x32_pass1_neon
.endr