Message ID | 1480584422-24237-1-git-send-email-martin@martin.st |
---|---|
State | Committed |
Commit | 0331c3f5e8cb6e6b53fab7893e91d1be1bfa979c |
Headers | show |
On 2016-12-01 11:26:56 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from > 15324 to 12388 bytes. > > This gives a small slowdown of a couple tens of cycles, up to around > 150 cycles for the full case of the largest transform, but makes > it more feasible to add more optimized versions of these transforms. > > Before: Cortex A7 A8 A9 A53 > vp9_inv_dct_dct_16x16_sub4_add_neon: 2063.4 1516.0 1719.5 1245.1 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3279.3 2454.5 2525.2 1982.3 > vp9_inv_dct_dct_32x32_sub4_add_neon: 10750.0 7955.4 8525.6 6754.2 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18574.0 17108.4 14216.7 12010.2 > > After: > vp9_inv_dct_dct_16x16_sub4_add_neon: 2060.8 1608.5 1735.7 1262.0 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 > vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 > --- > libavcodec/arm/vp9itxfm_neon.S | 43 +++++++++++++++++++++++++----------------- > 1 file changed, 26 insertions(+), 17 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 5abe435..22e63e5 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -534,7 +534,7 @@ function idct16x16_dc_add_neon > endfunc > .ltorg > > -.macro idct16 > +function idct16 > mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a > mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a > mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a > @@ -580,9 +580,10 @@ endfunc > vmov d4, d21 @ d4 = t10a > butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] > butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10] > -.endm > + bx lr > +endfunc > > -.macro iadst16 > +function iadst16 > movrel r12, iadst16_coeffs > vld1.16 {q0-q1}, [r12,:128] > > @@ -653,7 +654,8 @@ endfunc > > vmov 
d16, d2 > vmov d30, d4 > -.endm > + bx lr > +endfunc > > .macro itxfm16_1d_funcs txfm > @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, > @@ -662,6 +664,8 @@ endfunc > @ r1 = slice offset > @ r2 = src > function \txfm\()16_1d_4x16_pass1_neon > + push {lr} > + > mov r12, #32 > vmov.s16 q2, #0 > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > @@ -669,7 +673,7 @@ function \txfm\()16_1d_4x16_pass1_neon > vst1.16 {d4}, [r2,:64], r12 > .endr > > - \txfm\()16 > + bl \txfm\()16 > > @ Do four 4x4 transposes. Originally, d16-d31 contain the > @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 > @@ -682,7 +686,7 @@ function \txfm\()16_1d_4x16_pass1_neon > .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 > vst1.16 {d\i}, [r0,:64]! > .endr > - bx lr > + pop {pc} > 1: > @ Special case: For the last input column (r1 == 12), > @ which would be stored as the last row in the temp buffer, > @@ -709,7 +713,7 @@ function \txfm\()16_1d_4x16_pass1_neon > vmov d29, d17 > vmov d30, d18 > vmov d31, d19 > - bx lr > + pop {pc} > endfunc > > @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, > @@ -719,6 +723,7 @@ endfunc > @ r2 = src (temp buffer) > @ r3 = slice offset > function \txfm\()16_1d_4x16_pass2_neon > + push {lr} > mov r12, #32 > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 > vld1.16 {d\i}, [r2,:64], r12 > @@ -732,7 +737,7 @@ function \txfm\()16_1d_4x16_pass2_neon > > add r3, r0, r1 > lsl r1, r1, #1 > - \txfm\()16 > + bl \txfm\()16 > > .macro load_add_store coef0, coef1, coef2, coef3 > vrshr.s16 \coef0, \coef0, #6 > @@ -773,7 +778,7 @@ function \txfm\()16_1d_4x16_pass2_neon > load_add_store q12, q13, q14, q15 > .purgem load_add_store > > - bx lr > + pop {pc} > endfunc > .endm > > @@ -908,7 +913,7 @@ function idct32x32_dc_add_neon > bx lr > endfunc > > -.macro idct32_odd > +function idct32_odd > movrel r12, idct_coeffs > add r12, r12, #32 > vld1.16 {q0-q1}, [r12,:128] > @@ 
-967,7 +972,8 @@ endfunc > mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a > mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 > mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a > -.endm > + bx lr > +endfunc > > @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. > @ We don't have register space to do a single pass IDCT of 4x32 though, > @@ -979,6 +985,8 @@ endfunc > @ r1 = unused > @ r2 = src > function idct32_1d_4x32_pass1_neon > + push {lr} > + > movrel r12, idct_coeffs > vld1.16 {q0-q1}, [r12,:128] > > @@ -992,7 +1000,7 @@ function idct32_1d_4x32_pass1_neon > vst1.16 {d4}, [r2,:64], r12 > .endr > > - idct16 > + bl idct16 > > @ Do four 4x4 transposes. Originally, d16-d31 contain the > @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 > @@ -1028,7 +1036,7 @@ function idct32_1d_4x32_pass1_neon > vst1.16 {d4}, [r2,:64], r12 > .endr > > - idct32_odd > + bl idct32_odd > > transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 > > @@ -1054,7 +1062,7 @@ function idct32_1d_4x32_pass1_neon > store_rev 29, 25, 21, 17 > store_rev 28, 24, 20, 16 > .purgem store_rev > - bx lr > + pop {pc} > endfunc > .ltorg > > @@ -1065,6 +1073,7 @@ endfunc > @ r1 = dst stride > @ r2 = src (temp buffer) > function idct32_1d_4x32_pass2_neon > + push {lr} > movrel r12, idct_coeffs > vld1.16 {q0-q1}, [r12,:128] > > @@ -1075,7 +1084,7 @@ function idct32_1d_4x32_pass2_neon > .endr > sub r2, r2, r12, lsl #4 > > - idct16 > + bl idct16 > > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > vst1.16 {d\i}, [r2,:64], r12 > @@ -1091,7 +1100,7 @@ function idct32_1d_4x32_pass2_neon > sub r2, r2, r12, lsl #4 > sub r2, r2, #64 > > - idct32_odd > + bl idct32_odd > > mov r12, #128 > .macro load_acc_store a, b, c, d, neg=0 > @@ -1139,7 +1148,7 @@ function idct32_1d_4x32_pass2_neon > load_acc_store 24, 25, 26, 27, 1 > 
load_acc_store 28, 29, 30, 31, 1 > .purgem load_acc_store > - bx lr > + pop {pc} > endfunc > > const min_eob_idct_idct_32, align=4

OK. Sorry for the delay.

Janne
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 5abe435..22e63e5 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -534,7 +534,7 @@ function idct16x16_dc_add_neon endfunc .ltorg -.macro idct16 +function idct16 mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a @@ -580,9 +580,10 @@ endfunc vmov d4, d21 @ d4 = t10a butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10] -.endm + bx lr +endfunc -.macro iadst16 +function iadst16 movrel r12, iadst16_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -653,7 +654,8 @@ endfunc vmov d16, d2 vmov d30, d4 -.endm + bx lr +endfunc .macro itxfm16_1d_funcs txfm @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -662,6 +664,8 @@ endfunc @ r1 = slice offset @ r2 = src function \txfm\()16_1d_4x16_pass1_neon + push {lr} + mov r12, #32 vmov.s16 q2, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -669,7 +673,7 @@ function \txfm\()16_1d_4x16_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr - \txfm\()16 + bl \txfm\()16 @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @@ -682,7 +686,7 @@ function \txfm\()16_1d_4x16_pass1_neon .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 vst1.16 {d\i}, [r0,:64]! 
.endr - bx lr + pop {pc} 1: @ Special case: For the last input column (r1 == 12), @ which would be stored as the last row in the temp buffer, @@ -709,7 +713,7 @@ function \txfm\()16_1d_4x16_pass1_neon vmov d29, d17 vmov d30, d18 vmov d31, d19 - bx lr + pop {pc} endfunc @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -719,6 +723,7 @@ endfunc @ r2 = src (temp buffer) @ r3 = slice offset function \txfm\()16_1d_4x16_pass2_neon + push {lr} mov r12, #32 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 vld1.16 {d\i}, [r2,:64], r12 @@ -732,7 +737,7 @@ function \txfm\()16_1d_4x16_pass2_neon add r3, r0, r1 lsl r1, r1, #1 - \txfm\()16 + bl \txfm\()16 .macro load_add_store coef0, coef1, coef2, coef3 vrshr.s16 \coef0, \coef0, #6 @@ -773,7 +778,7 @@ function \txfm\()16_1d_4x16_pass2_neon load_add_store q12, q13, q14, q15 .purgem load_add_store - bx lr + pop {pc} endfunc .endm @@ -908,7 +913,7 @@ function idct32x32_dc_add_neon bx lr endfunc -.macro idct32_odd +function idct32_odd movrel r12, idct_coeffs add r12, r12, #32 vld1.16 {q0-q1}, [r12,:128] @@ -967,7 +972,8 @@ endfunc mbutterfly0 d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a -.endm + bx lr +endfunc @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. @ We don't have register space to do a single pass IDCT of 4x32 though, @@ -979,6 +985,8 @@ endfunc @ r1 = unused @ r2 = src function idct32_1d_4x32_pass1_neon + push {lr} + movrel r12, idct_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -992,7 +1000,7 @@ function idct32_1d_4x32_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr - idct16 + bl idct16 @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. 
Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @@ -1028,7 +1036,7 @@ function idct32_1d_4x32_pass1_neon vst1.16 {d4}, [r2,:64], r12 .endr - idct32_odd + bl idct32_odd transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 @@ -1054,7 +1062,7 @@ function idct32_1d_4x32_pass1_neon store_rev 29, 25, 21, 17 store_rev 28, 24, 20, 16 .purgem store_rev - bx lr + pop {pc} endfunc .ltorg @@ -1065,6 +1073,7 @@ endfunc @ r1 = dst stride @ r2 = src (temp buffer) function idct32_1d_4x32_pass2_neon + push {lr} movrel r12, idct_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -1075,7 +1084,7 @@ function idct32_1d_4x32_pass2_neon .endr sub r2, r2, r12, lsl #4 - idct16 + bl idct16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vst1.16 {d\i}, [r2,:64], r12 @@ -1091,7 +1100,7 @@ function idct32_1d_4x32_pass2_neon sub r2, r2, r12, lsl #4 sub r2, r2, #64 - idct32_odd + bl idct32_odd mov r12, #128 .macro load_acc_store a, b, c, d, neg=0 @@ -1139,7 +1148,7 @@ function idct32_1d_4x32_pass2_neon load_acc_store 24, 25, 26, 27, 1 load_acc_store 28, 29, 30, 31, 1 .purgem load_acc_store - bx lr + pop {pc} endfunc const min_eob_idct_idct_32, align=4