[3/6] arm: Add assembly version of h264_find_start_code_candidate

Message ID 1375704771-14236-4-git-send-email-bavison@riscosopen.org
State New
Headers show

Commit Message

Ben Avison Aug. 5, 2013, 12:12 p.m.
                Before          After
                Mean   StdDev   Mean   StdDev  Change
This function    508.8 23.4      185.4  9.0    +174.4%
Overall         3068.5 31.7     2752.1 29.4     +11.5%

In combination with the preceding patch:
                Before          After
                Mean   StdDev   Mean   StdDev  Change
Overall         2925.6 26.2     2752.1 29.4     +6.3%
---
 libavcodec/arm/Makefile           |    1 +
 libavcodec/arm/h264dsp_armv6.S    |  253 +++++++++++++++++++++++++++++++++++++
 libavcodec/arm/h264dsp_init_arm.c |    4 +
 3 files changed, 258 insertions(+), 0 deletions(-)
 create mode 100644 libavcodec/arm/h264dsp_armv6.S

Patch

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index e941aaa..9c64b36 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -45,6 +45,7 @@  ARMV6-OBJS-$(CONFIG_DSPUTIL)           += arm/dsputil_init_armv6.o      \
                                           arm/simple_idct_armv6.o       \
 
 ARMV6-OBJS-$(CONFIG_AC3DSP)            += arm/ac3dsp_armv6.o
+ARMV6-OBJS-$(CONFIG_H264DSP)           += arm/h264dsp_armv6.o
 ARMV6-OBJS-$(CONFIG_HPELDSP)           += arm/hpeldsp_init_armv6.o      \
                                           arm/hpeldsp_armv6.o
 ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
diff --git a/libavcodec/arm/h264dsp_armv6.S b/libavcodec/arm/h264dsp_armv6.S
new file mode 100644
index 0000000..c4f12a6
--- /dev/null
+++ b/libavcodec/arm/h264dsp_armv6.S
@@ -0,0 +1,253 @@ 
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+RESULT  .req    a1      @ return value; same register as BUF
+BUF     .req    a1      @ arg 1: start of the buffer
+SIZE    .req    a2      @ arg 2: byte count, counted down while scanning
+PATTERN .req    a3      @ test mask, loaded with 0x80008000 in the function
+PTR     .req    a4      @ read pointer into the buffer
+DAT0    .req    v1      @ DAT0-DAT3: words loaded from the buffer
+DAT1    .req    v2
+DAT2    .req    v3
+DAT3    .req    v4
+TMP0    .req    v5      @ TMP0-TMP3: test result for the matching DATn
+TMP1    .req    v6
+TMP2    .req    ip
+TMP3    .req    lr      @ usable as scratch: lr is pushed on function entry
+
+#define PRELOAD_DISTANCE 4 /* pld look-ahead, in units of 32 bytes */
+
+.macro innerloop4 @ test one word at PTR for a halfword equal to 0x0000 or 0x0001
+        ldr     DAT0, [PTR], #4               @ fetch word, PTR += 4
+        subs    SIZE, SIZE, #4 @ C flag survives rest of macro
+        sub     TMP0, DAT0, PATTERN, lsr #14  @ subtract 2 from each halfword
+        bic     TMP0, TMP0, DAT0              @ keep bits that were clear before
+        ands    TMP0, TMP0, PATTERN           @ bit 15 per halfword; Z clear = hit
+.endm
+
+.macro innerloop16  decrement, do_preload @ test 4 words at PTR; Z clear = hit
+        ldmia   PTR!, {DAT0,DAT1,DAT2,DAT3}   @ fetch 16 bytes, PTR += 16
+ .ifnc "\do_preload",""
+        pld     [PTR, #PRELOAD_DISTANCE*32]   @ only when do_preload is non-empty
+ .endif
+ .ifnc "\decrement",""
+        subs    SIZE, SIZE, #\decrement @ C flag survives rest of macro
+ .endif
+        sub     TMP0, DAT0, PATTERN, lsr #14  @ same per-word test as innerloop4,
+        sub     TMP1, DAT1, PATTERN, lsr #14  @ interleaved across the four words
+        bic     TMP0, TMP0, DAT0
+        bic     TMP1, TMP1, DAT1
+        sub     TMP2, DAT2, PATTERN, lsr #14
+        sub     TMP3, DAT3, PATTERN, lsr #14
+        ands    TMP0, TMP0, PATTERN           @ Z clear: hit in DAT0
+        bic     TMP2, TMP2, DAT2
+        it      eq
+        andseq  TMP1, TMP1, PATTERN           @ masked only if DAT0 had no hit
+        bic     TMP3, TMP3, DAT3
+        itt     eq
+        andseq  TMP2, TMP2, PATTERN           @ masked only if no hit so far
+        andseq  TMP3, TMP3, PATTERN           @ TMPn past the first hit stay unmasked
+.endm
+
+/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */
+function ff_h264_find_start_code_candidate_armv6, export=1
+        push    {v1-v6,lr}              @ lr is reused as TMP3 below
+        mov     PTR, BUF                @ BUF (a1) is kept until RESULT is written
+        @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
+        @ before using code that does preloads
+        cmp     SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
+        blo     60f                     @ too short: plain byte loop
+
+        @ Get to word-alignment, 1 byte at a time
+        tst     PTR, #3
+        beq     2f
+1:      ldrb    DAT0, [PTR], #1
+        sub     SIZE, SIZE, #1
+        teq     DAT0, #0
+        beq     90f                     @ zero byte is a candidate
+        tst     PTR, #3
+        bne     1b
+2:      @ Get to 4-word alignment, 1 word at a time
+        ldr     PATTERN, =0x80008000    @ bit 15 of each halfword
+        setend  be                      @ word loads big-endian until setend le
+        tst     PTR, #12
+        beq     4f
+3:      innerloop4
+        bne     91f
+        tst     PTR, #12
+        bne     3b
+4:      @ Get to cacheline (8-word) alignment
+        tst     PTR, #16
+        beq     5f
+        innerloop16  16
+        bne     93f
+5:      @ Check complete cachelines, with preloading
+        @ We need to stop when there are still (PRELOAD_DISTANCE+1)
+        @ complete cachelines to go
+        sub     SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
+6:      innerloop16  , do_preload
+        bne     93f
+        innerloop16  32
+        bne     93f
+        bcs     6b                      @ C comes from the subs inside the macro
+        @ Preload trailing part-cacheline, if any
+        tst     SIZE, #31
+        beq     7f
+        pld     [PTR, #(PRELOAD_DISTANCE+1)*32]
+        @ Check remaining data without doing any more preloads. First
+        @ do in chunks of 4 words:
+7:      adds    SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
+        bmi     9f                      @ fewer than 16 bytes left
+8:      innerloop16  16
+        bne     93f
+        bcs     8b
+        @ Then in words:
+9:      adds    SIZE, SIZE, #16 - 4
+        bmi     11f                     @ fewer than 4 bytes left
+10:     innerloop4
+        bne     91f
+        bcs     10b
+11:     setend  le
+        @ Check second byte of final halfword
+        ldrb    DAT0, [PTR, #-1]
+        teq     DAT0, #0
+        beq     90f
+        @ Check any remaining bytes
+        tst     SIZE, #3                @ low two bits hold the leftover count
+        beq     13f
+12:     ldrb    DAT0, [PTR], #1
+        sub     SIZE, SIZE, #1
+        teq     DAT0, #0
+        beq     90f
+        tst     SIZE, #3
+        bne     12b
+        @ No candidate found
+13:     sub     RESULT, PTR, BUF        @ return number of bytes scanned
+        b       99f
+
+60:     @ Small buffer - simply check by looping over bytes
+        subs    SIZE, SIZE, #1
+        bcc     62f                     @ size == 0: return 0, not the buf pointer
+61:     ldrb    DAT0, [PTR], #1
+        subs    SIZE, SIZE, #1
+        teq     DAT0, #0                @ leaves C from the subs untouched
+        beq     90f
+        bcs     61b
+        @ No candidate found
+62:     sub     RESULT, PTR, BUF        @ return number of bytes scanned
+        b       99f
+
+90:     @ Found a candidate at the preceding byte
+        sub     RESULT, PTR, BUF
+        sub     RESULT, RESULT, #1      @ PTR was already stepped past the byte
+        b       99f                     @ only reached in little-endian state
+
+91:     @ Found a candidate somewhere in the preceding 4 bytes
+        sub     RESULT, PTR, BUF
+        sub     RESULT, RESULT, #4      @ offset of the word just tested
+        sub     TMP0, DAT0, #0x20000    @ retest the first halfword on its own
+        bics    TMP0, TMP0, DAT0        @ N set: hit is in the first halfword
+        itt     pl
+        ldrbpl  DAT0, [PTR, #-3]        @ else hit is in the second halfword:
+        addpl   RESULT, RESULT, #2      @ point at it, fetch the byte before it
+        bpl     92f
+        teq     RESULT, #0
+        beq     98f @ don't look back a byte if found at first byte in buffer
+        ldrb    DAT0, [PTR, #-5]        @ byte just before this word
+92:     teq     DAT0, #0
+        it      eq
+        subeq   RESULT, RESULT, #1      @ preceding byte is zero: report that one
+        b       98f
+
+93:     @ Found a candidate somewhere in the preceding 16 bytes
+        sub     RESULT, PTR, BUF
+        sub     RESULT, RESULT, #16     @ offset of the first of the four words
+        teq     TMP0, #0
+        beq     95f @ not in first 4 bytes
+        sub     TMP0, DAT0, #0x20000    @ from here on, same steps as at 91
+        bics    TMP0, TMP0, DAT0
+        itt     pl
+        ldrbpl  DAT0, [PTR, #-15]
+        addpl   RESULT, RESULT, #2
+        bpl     94f
+        teq     RESULT, #0
+        beq     98f @ don't look back a byte if found at first byte in buffer
+        ldrb    DAT0, [PTR, #-17]
+94:     teq     DAT0, #0
+        it      eq
+        subeq   RESULT, RESULT, #1
+        b       98f
+95:     add     RESULT, RESULT, #4      @ second word; an earlier byte always exists
+        teq     TMP1, #0
+        beq     96f @ not in next 4 bytes
+        sub     TMP1, DAT1, #0x20000
+        bics    TMP1, TMP1, DAT1
+        itee    mi
+        ldrbmi  DAT0, [PTR, #-13]       @ first halfword: byte before the word
+        ldrbpl  DAT0, [PTR, #-11]       @ second halfword: byte before that half
+        addpl   RESULT, RESULT, #2
+        teq     DAT0, #0
+        it      eq
+        subeq   RESULT, RESULT, #1
+        b       98f
+96:     add     RESULT, RESULT, #4      @ third word
+        teq     TMP2, #0
+        beq     97f @ not in next 4 bytes
+        sub     TMP2, DAT2, #0x20000
+        bics    TMP2, TMP2, DAT2
+        itee    mi
+        ldrbmi  DAT0, [PTR, #-9]
+        ldrbpl  DAT0, [PTR, #-7]
+        addpl   RESULT, RESULT, #2
+        teq     DAT0, #0
+        it      eq
+        subeq   RESULT, RESULT, #1
+        b       98f
+97:     add     RESULT, RESULT, #4      @ fourth word: the only one left
+        sub     TMP3, DAT3, #0x20000
+        bics    TMP3, TMP3, DAT3
+        itee    mi
+        ldrbmi  DAT0, [PTR, #-5]
+        ldrbpl  DAT0, [PTR, #-3]
+        addpl   RESULT, RESULT, #2
+        teq     DAT0, #0
+        it      eq
+        subeq   RESULT, RESULT, #1
+        @ drop through to 98f
+98:     setend  le                      @ restore endianness before the pop
+99:     pop     {v1-v6,pc}
+endfunc
+
+        .unreq  RESULT                  @ release the aliases set up with .req
+        .unreq  BUF
+        .unreq  SIZE
+        .unreq  PATTERN
+        .unreq  PTR
+        .unreq  DAT0
+        .unreq  DAT1
+        .unreq  DAT2
+        .unreq  DAT3
+        .unreq  TMP0
+        .unreq  TMP1
+        .unreq  TMP2
+        .unreq  TMP3
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index bb8b3b9..b206a1b 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -24,6 +24,8 @@ 
 #include "libavutil/arm/cpu.h"
 #include "libavcodec/h264dsp.h"
 
+int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size);
+
 void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
                                      int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
@@ -102,6 +104,8 @@  av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (have_armv6(cpu_flags))
+        c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6;
     if (have_neon(cpu_flags))
         h264dsp_init_neon(c, bit_depth, chroma_format_idc);
 }