Patchwork target/xtensa: rework zero overhead loops implementation

login
register
mail settings
Submitter Max Filippov
Date Jan. 11, 2019, 11:49 a.m.
Message ID <20190111114918.27893-1-jcmvbkbc@gmail.com>
Download mbox | patch
Permalink /patch/697561/
State New
Headers show

Comments

Max Filippov - Jan. 11, 2019, 11:49 a.m.
Don't invalidate TB with the end of zero overhead loop when LBEG or LEND
change. Instead encode the distance from the start of the page where the
TB starts to the LEND in the TB cs_base and generate loopback code when
the next PC matches encoded LEND. Distance to a destination within the
same page and up to a maximum instruction length into the next page is
encoded literally, otherwise it's zero. The distance from LEND to LBEG
is also encoded in the cs_base: it's encoded literally when less than
256 or as 0 otherwise. This allows for TB chaining for the loopback
branch at the end of a loop for the most common loop sizes.

With this change the resulting emulation speed is about 10% higher in
softmmu mode on uClibc-ng and LTP tests. Emulation speed in linux
user mode is a few percent lower because there's no direct TB chaining
between different memory pages. Testing with lower limit on direct TB
chainig range shows gradual slowdown to ~15% for the block size of 64
bytes and ~50% for the block size of 32 bytes.

Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
---
 target/xtensa/cpu.h          | 21 +++++++++++++++
 target/xtensa/helper.h       |  2 --
 target/xtensa/op_helper.c    | 24 -----------------
 target/xtensa/overlay_tool.h |  1 +
 target/xtensa/translate.c    | 61 +++++++++++++++-----------------------------
 5 files changed, 42 insertions(+), 67 deletions(-)
Richard Henderson - Jan. 11, 2019, 8:53 p.m.
On 1/11/19 10:49 PM, Max Filippov wrote:
> +#define LINKABLE_BITS 12
> +#define LINKABLE_SIZE (1u << LINKABLE_BITS)
> +#define LINKABLE_MASK (-LINKABLE_SIZE)

What is this for?  It seems to be replicating TARGET_PAGE_BITS.


r~
Max Filippov - Jan. 11, 2019, 8:59 p.m.
On Fri, Jan 11, 2019 at 12:53 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 1/11/19 10:49 PM, Max Filippov wrote:
> > +#define LINKABLE_BITS 12
> > +#define LINKABLE_SIZE (1u << LINKABLE_BITS)
> > +#define LINKABLE_MASK (-LINKABLE_SIZE)
>
> What is this for?  It seems to be replicating TARGET_PAGE_BITS.

I used it to play with different sizes of regions where direct TB chaining
is allowed. In linux-user it is possible to have it bigger than the page size.
It looks like the optimum is somewhere in the range 10..12.
I left it there because it looks logically independent from the page size,
but I sure can drop it.
Richard Henderson - Jan. 11, 2019, 9:11 p.m.
On 1/11/19 10:49 PM, Max Filippov wrote:
> @@ -706,6 +716,17 @@ static inline void cpu_get_tb_cpu_state(CPUXtensaState *env, target_ulong *pc,
>      *flags |= xtensa_get_ring(env);
>      if (env->sregs[PS] & PS_EXCM) {
>          *flags |= XTENSA_TBFLAG_EXCM;
> +    } else if (xtensa_option_enabled(env->config, XTENSA_OPTION_LOOP)) {
> +        target_ulong lend_dist = env->sregs[LEND] - (env->pc & LINKABLE_MASK);
> +
> +        if (lend_dist < LINKABLE_SIZE + env->config->max_insn_size) {
> +            target_ulong lbeg_off = env->sregs[LEND] - env->sregs[LBEG];
> +
> +            *cs_base = lend_dist;
> +            if (lbeg_off < 256) {
> +                *cs_base |= lbeg_off << XTENSA_CSBASE_LBEG_OFF_SHIFT;
> +            }
> +        }
>      }
>      if (xtensa_option_enabled(env->config, XTENSA_OPTION_EXTENDED_L32R) &&
>              (env->sregs[LITBASE] & 1)) {

I think the only other thing that would be nice is some comment that describes
the loop scheme.  I can follow it now, but it took a while of re-reading.  In
particular, the fact that 0 means "disabled", and happens to work because we
(correctly) only check for LEND at the end of an instruction.  Thus the offset
from pc_first will always be non-zero when we check.


r~

Patch

diff --git a/target/xtensa/cpu.h b/target/xtensa/cpu.h
index 34e5ccd9f1d6..22851d2350a7 100644
--- a/target/xtensa/cpu.h
+++ b/target/xtensa/cpu.h
@@ -400,6 +400,7 @@  struct XtensaConfig {
     int excm_level;
     int ndepc;
     unsigned inst_fetch_width;
+    unsigned max_insn_size;
     uint32_t vecbase;
     uint32_t exception_vector[EXC_MAX];
     unsigned ninterrupt;
@@ -695,6 +696,15 @@  static inline int cpu_mmu_index(CPUXtensaState *env, bool ifetch)
 #define XTENSA_TBFLAG_CALLINC_MASK 0x180000
 #define XTENSA_TBFLAG_CALLINC_SHIFT 19
 
+#define XTENSA_CSBASE_LEND_MASK 0x0000ffff
+#define XTENSA_CSBASE_LEND_SHIFT 0
+#define XTENSA_CSBASE_LBEG_OFF_MASK 0x00ff0000
+#define XTENSA_CSBASE_LBEG_OFF_SHIFT 16
+
+#define LINKABLE_BITS 12
+#define LINKABLE_SIZE (1u << LINKABLE_BITS)
+#define LINKABLE_MASK (-LINKABLE_SIZE)
+
 static inline void cpu_get_tb_cpu_state(CPUXtensaState *env, target_ulong *pc,
         target_ulong *cs_base, uint32_t *flags)
 {
@@ -706,6 +716,17 @@  static inline void cpu_get_tb_cpu_state(CPUXtensaState *env, target_ulong *pc,
     *flags |= xtensa_get_ring(env);
     if (env->sregs[PS] & PS_EXCM) {
         *flags |= XTENSA_TBFLAG_EXCM;
+    } else if (xtensa_option_enabled(env->config, XTENSA_OPTION_LOOP)) {
+        target_ulong lend_dist = env->sregs[LEND] - (env->pc & LINKABLE_MASK);
+
+        if (lend_dist < LINKABLE_SIZE + env->config->max_insn_size) {
+            target_ulong lbeg_off = env->sregs[LEND] - env->sregs[LBEG];
+
+            *cs_base = lend_dist;
+            if (lbeg_off < 256) {
+                *cs_base |= lbeg_off << XTENSA_CSBASE_LBEG_OFF_SHIFT;
+            }
+        }
     }
     if (xtensa_option_enabled(env->config, XTENSA_OPTION_EXTENDED_L32R) &&
             (env->sregs[LITBASE] & 1)) {
diff --git a/target/xtensa/helper.h b/target/xtensa/helper.h
index 10153c245360..2ebba0b2c2bf 100644
--- a/target/xtensa/helper.h
+++ b/target/xtensa/helper.h
@@ -12,8 +12,6 @@  DEF_HELPER_2(rotw, void, env, i32)
 DEF_HELPER_3(window_check, noreturn, env, i32, i32)
 DEF_HELPER_1(restore_owb, void, env)
 DEF_HELPER_2(movsp, void, env, i32)
-DEF_HELPER_2(wsr_lbeg, void, env, i32)
-DEF_HELPER_2(wsr_lend, void, env, i32)
 #ifndef CONFIG_USER_ONLY
 DEF_HELPER_1(simcall, void, env)
 #endif
diff --git a/target/xtensa/op_helper.c b/target/xtensa/op_helper.c
index e4b42ab3e56c..078aeb6c2c94 100644
--- a/target/xtensa/op_helper.c
+++ b/target/xtensa/op_helper.c
@@ -107,13 +107,6 @@  static void tb_invalidate_virtual_addr(CPUXtensaState *env, uint32_t vaddr)
     }
 }
 
-#else
-
-static void tb_invalidate_virtual_addr(CPUXtensaState *env, uint32_t vaddr)
-{
-    tb_invalidate_phys_addr(vaddr);
-}
-
 #endif
 
 void HELPER(exception)(CPUXtensaState *env, uint32_t excp)
@@ -370,23 +363,6 @@  void HELPER(movsp)(CPUXtensaState *env, uint32_t pc)
     }
 }
 
-void HELPER(wsr_lbeg)(CPUXtensaState *env, uint32_t v)
-{
-    if (env->sregs[LBEG] != v) {
-        tb_invalidate_virtual_addr(env, env->sregs[LEND] - 1);
-        env->sregs[LBEG] = v;
-    }
-}
-
-void HELPER(wsr_lend)(CPUXtensaState *env, uint32_t v)
-{
-    if (env->sregs[LEND] != v) {
-        tb_invalidate_virtual_addr(env, env->sregs[LEND] - 1);
-        env->sregs[LEND] = v;
-        tb_invalidate_virtual_addr(env, env->sregs[LEND] - 1);
-    }
-}
-
 void HELPER(dump_state)(CPUXtensaState *env)
 {
     XtensaCPU *cpu = xtensa_env_get_cpu(env);
diff --git a/target/xtensa/overlay_tool.h b/target/xtensa/overlay_tool.h
index ee37a04a176c..12609a0d0c1e 100644
--- a/target/xtensa/overlay_tool.h
+++ b/target/xtensa/overlay_tool.h
@@ -457,6 +457,7 @@ 
     .nareg = XCHAL_NUM_AREGS, \
     .ndepc = (XCHAL_XEA_VERSION >= 2), \
     .inst_fetch_width = XCHAL_INST_FETCH_WIDTH, \
+    .max_insn_size = XCHAL_MAX_INSTRUCTION_SIZE, \
     EXCEPTIONS_SECTION, \
     INTERRUPTS_SECTION, \
     TLB_SECTION, \
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index 46e13384488e..f200d785550e 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -53,7 +53,7 @@  struct DisasContext {
     uint32_t pc;
     int cring;
     int ring;
-    uint32_t lbeg;
+    uint32_t lbeg_off;
     uint32_t lend;
 
     bool sar_5bit;
@@ -390,11 +390,9 @@  static void gen_jump(DisasContext *dc, TCGv dest)
 static void gen_jumpi(DisasContext *dc, uint32_t dest, int slot)
 {
     TCGv_i32 tmp = tcg_const_i32(dest);
-#ifndef CONFIG_USER_ONLY
-    if (((dc->base.pc_first ^ dest) & TARGET_PAGE_MASK) != 0) {
+    if (((dc->base.pc_first ^ dest) & LINKABLE_MASK) != 0) {
         slot = -1;
     }
-#endif
     gen_jump_slot(dc, tmp, slot);
     tcg_temp_free(tmp);
 }
@@ -420,25 +418,25 @@  static void gen_callw(DisasContext *dc, int callinc, TCGv_i32 dest)
 static void gen_callwi(DisasContext *dc, int callinc, uint32_t dest, int slot)
 {
     TCGv_i32 tmp = tcg_const_i32(dest);
-#ifndef CONFIG_USER_ONLY
-    if (((dc->base.pc_first ^ dest) & TARGET_PAGE_MASK) != 0) {
+    if (((dc->base.pc_first ^ dest) & LINKABLE_MASK) != 0) {
         slot = -1;
     }
-#endif
     gen_callw_slot(dc, callinc, tmp, slot);
     tcg_temp_free(tmp);
 }
 
 static bool gen_check_loop_end(DisasContext *dc, int slot)
 {
-    if (option_enabled(dc, XTENSA_OPTION_LOOP) &&
-            !(dc->base.tb->flags & XTENSA_TBFLAG_EXCM) &&
-            dc->base.pc_next == dc->lend) {
+    if (dc->base.pc_next == dc->lend) {
         TCGLabel *label = gen_new_label();
 
         tcg_gen_brcondi_i32(TCG_COND_EQ, cpu_SR[LCOUNT], 0, label);
         tcg_gen_subi_i32(cpu_SR[LCOUNT], cpu_SR[LCOUNT], 1);
-        gen_jumpi(dc, dc->lbeg, slot);
+        if (dc->lbeg_off) {
+            gen_jumpi(dc, dc->base.pc_next - dc->lbeg_off, slot);
+        } else {
+            gen_jump(dc, cpu_SR[LBEG]);
+        }
         gen_set_label(label);
         gen_jumpi(dc, dc->base.pc_next, -1);
         return true;
@@ -534,16 +532,6 @@  static void gen_rsr(DisasContext *dc, TCGv_i32 d, uint32_t sr)
     }
 }
 
-static void gen_wsr_lbeg(DisasContext *dc, uint32_t sr, TCGv_i32 s)
-{
-    gen_helper_wsr_lbeg(cpu_env, s);
-}
-
-static void gen_wsr_lend(DisasContext *dc, uint32_t sr, TCGv_i32 s)
-{
-    gen_helper_wsr_lend(cpu_env, s);
-}
-
 static void gen_wsr_sar(DisasContext *dc, uint32_t sr, TCGv_i32 s)
 {
     tcg_gen_andi_i32(cpu_SR[sr], s, 0x3f);
@@ -743,8 +731,6 @@  static void gen_wsr(DisasContext *dc, uint32_t sr, TCGv_i32 s)
 {
     static void (* const wsr_handler[256])(DisasContext *dc,
                                            uint32_t sr, TCGv_i32 v) = {
-        [LBEG] = gen_wsr_lbeg,
-        [LEND] = gen_wsr_lend,
         [SAR] = gen_wsr_sar,
         [BR] = gen_wsr_br,
         [LITBASE] = gen_wsr_litbase,
@@ -906,13 +892,6 @@  static void disas_xtensa_insn(CPUXtensaState *env, DisasContext *dc)
     }
 
     dc->base.pc_next = dc->pc + len;
-    if (xtensa_option_enabled(dc->config, XTENSA_OPTION_LOOP) &&
-        dc->lbeg == dc->pc &&
-        ((dc->pc ^ (dc->base.pc_next - 1)) & -dc->config->inst_fetch_width)) {
-        qemu_log_mask(LOG_GUEST_ERROR,
-                      "unaligned first instruction of a loop (pc = %08x)\n",
-                      dc->pc);
-    }
     for (i = 1; i < len; ++i) {
         b[i] = cpu_ldub_code(env, dc->pc + i);
     }
@@ -1097,8 +1076,10 @@  static void xtensa_tr_init_disas_context(DisasContextBase *dcbase,
     dc->pc = dc->base.pc_first;
     dc->ring = tb_flags & XTENSA_TBFLAG_RING_MASK;
     dc->cring = (tb_flags & XTENSA_TBFLAG_EXCM) ? 0 : dc->ring;
-    dc->lbeg = env->sregs[LBEG];
-    dc->lend = env->sregs[LEND];
+    dc->lbeg_off = (dc->base.tb->cs_base & XTENSA_CSBASE_LBEG_OFF_MASK) >>
+        XTENSA_CSBASE_LBEG_OFF_SHIFT;
+    dc->lend = (dc->base.tb->cs_base & XTENSA_CSBASE_LEND_MASK) +
+        (dc->base.pc_first & LINKABLE_MASK);
     dc->debug = tb_flags & XTENSA_TBFLAG_DEBUG;
     dc->icount = tb_flags & XTENSA_TBFLAG_ICOUNT;
     dc->cpenable = (tb_flags & XTENSA_TBFLAG_CPENABLE_MASK) >>
@@ -1189,10 +1170,10 @@  static void xtensa_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
     }
 
     /* End the TB if the next insn will cross into the next page.  */
-    page_start = dc->base.pc_first & TARGET_PAGE_MASK;
+    page_start = dc->base.pc_first & LINKABLE_MASK;
     if (dc->base.is_jmp == DISAS_NEXT &&
         (dc->pc - page_start >= TARGET_PAGE_SIZE ||
-         dc->pc - page_start + xtensa_insn_len(env, dc) > TARGET_PAGE_SIZE)) {
+         dc->pc - page_start + xtensa_insn_len(env, dc) > LINKABLE_SIZE)) {
         dc->base.is_jmp = DISAS_TOO_MANY;
     }
 }
@@ -1712,12 +1693,10 @@  static void translate_loop(DisasContext *dc, const uint32_t arg[],
                            const uint32_t par[])
 {
     uint32_t lend = arg[1];
-    TCGv_i32 tmp = tcg_const_i32(lend);
 
     tcg_gen_subi_i32(cpu_SR[LCOUNT], cpu_R[arg[0]], 1);
     tcg_gen_movi_i32(cpu_SR[LBEG], dc->base.pc_next);
-    gen_helper_wsr_lend(cpu_env, tmp);
-    tcg_temp_free(tmp);
+    tcg_gen_movi_i32(cpu_SR[LEND], lend);
 
     if (par[0] != TCG_COND_NEVER) {
         TCGLabel *label = gen_new_label();
@@ -4609,7 +4588,7 @@  static const XtensaOpcodeOps core_ops[] = {
         .translate = translate_wsr,
         .test_ill = test_ill_wsr,
         .par = (const uint32_t[]){LBEG},
-        .op_flags = XTENSA_OP_EXIT_TB_0,
+        .op_flags = XTENSA_OP_EXIT_TB_M1,
         .windowed_register_op = 0x1,
     }, {
         .name = "wsr.lcount",
@@ -4622,7 +4601,7 @@  static const XtensaOpcodeOps core_ops[] = {
         .translate = translate_wsr,
         .test_ill = test_ill_wsr,
         .par = (const uint32_t[]){LEND},
-        .op_flags = XTENSA_OP_EXIT_TB_0,
+        .op_flags = XTENSA_OP_EXIT_TB_M1,
         .windowed_register_op = 0x1,
     }, {
         .name = "wsr.litbase",
@@ -5183,7 +5162,7 @@  static const XtensaOpcodeOps core_ops[] = {
         .translate = translate_xsr,
         .test_ill = test_ill_xsr,
         .par = (const uint32_t[]){LBEG},
-        .op_flags = XTENSA_OP_EXIT_TB_0,
+        .op_flags = XTENSA_OP_EXIT_TB_M1,
         .windowed_register_op = 0x1,
     }, {
         .name = "xsr.lcount",
@@ -5196,7 +5175,7 @@  static const XtensaOpcodeOps core_ops[] = {
         .translate = translate_xsr,
         .test_ill = test_ill_xsr,
         .par = (const uint32_t[]){LEND},
-        .op_flags = XTENSA_OP_EXIT_TB_0,
+        .op_flags = XTENSA_OP_EXIT_TB_M1,
         .windowed_register_op = 0x1,
     }, {
         .name = "xsr.litbase",