How does perf measure instructions and branch misses on e500?

cancel
Showing results for 
Search instead for 
Did you mean: 

How does perf measure instructions and branch misses on e500?

Jump to solution
1,133 Views
NXP Employee
NXP Employee

I have run the following command on T4240 board.

#perf stat ls

Performance counter stats for 'ls':

          2.712256 task-clock                #    0.744 CPUs utilized

                 1 context-switches          #    0.369 K/sec

                 0 cpu-migrations            #    0.000 K/sec

               241 page-faults               #    0.089 M/sec

                 0 cycles                    #    0.000 GHz

   <not supported> stalled-cycles-frontend

   <not supported> stalled-cycles-backend

                 0 instructions              #    0.00  insns per cycle

                 0 branches                  #    0.000 K/sec

                 0 branch-misses             #    0.00% of all branches

       0.003646108 seconds time elapsed

I would like to know how perf measures instructions, branches, and branch-misses. Why is the count 0 for all three of these fields?

Which particular registers does perf read to obtain the counts of instructions, branches, and branch-misses?


Thanks

Amit

1 Solution
24 Views
NXP Employee
NXP Employee

You'll need to do more than that to make perfmon work in a KVM guest.  You'll also need to clear MSRP[PMMP] so that the guest can use MSR[PMM], and handle perfmon interrupt reflection.  The latter is tricky because you need to mask the interrupt until the guest has cleared it, but MSRP[PMMP] causes mfpmr/mtpmr to not trap, so the guest will see what you've done to mask the interrupt.  Look at the perfmon code in the Freescale Embedded Hypervisor (sdk/hypervisor/hypervisor.git - Embedded hypervisor for PowerPC) for an example of how to deal with this.

View solution in original post

11 Replies
24 Views
NXP Employee
NXP Employee

I tried "perf stat ls" on an e6500 with both the latest SDK kernel and a recent upstream kernel, and got non-zero values for instructions, branches, and branch-misses.  What kernel are you using?  Can you post the config?

As for how this information is read at the hardware level, see chapter 9 "Debug and Performance Monitor Facilities" of the e6500 core reference manual.  To see the specific event numbers being used for each event, look at arch/powerpc/perf/e6500-pmu.c.

0 Kudos
24 Views
NXP Employee
NXP Employee

Sorry @Scott, I didn't mention that I was running this command in a guest. I came up with the set of patches below to emulate mfpmr/mtpmr in KVM so that perf can use these hardware counters.

--- a/arch/powerpc/include/asm/ppc-opcode.h

+++ b/arch/powerpc/include/asm/ppc-opcode.h

@@ -98,11 +98,13 @@

#define OP_31_XOP_STBUX     247

#define OP_31_XOP_LHZX      279

#define OP_31_XOP_LHZUX     311

+#define OP_31_XOP_MFPMR     334

#define OP_31_XOP_MFSPR     339

#define OP_31_XOP_LHAX      343

#define OP_31_XOP_LHAUX     375

#define OP_31_XOP_STHX      407

#define OP_31_XOP_STHUX     439

+#define OP_31_XOP_MTPMR     462    

#define OP_31_XOP_MTSPR     467

#define OP_31_XOP_DCBI      470

#define OP_31_XOP_LWBRX     534

##############################################################

--- a/arch/powerpc/include/asm/disassemble.h

+++ b/arch/powerpc/include/asm/disassemble.h

@@ -36,6 +36,11 @@ static inline unsigned int get_sprn(u32 inst)

{

        return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);

}

+/* PMRN is in the same location as SPRN  PMR(11:20)==SPR(11:20) */

+static inline unsigned int get_pmrn(u32 inst)

+{

+        return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);

+}

static inline unsigned int get_dcrn(u32 inst)

{

##################################################

--- a/arch/powerpc/include/uapi/asm/kvm_para.h

+++ b/arch/powerpc/include/uapi/asm/kvm_para.h

@@ -56,6 +56,12 @@ struct kvm_vcpu_arch_shared {

        __u32 mas6;

        __u32 esr;

        __u32 pir;

+        __u32 pmc0;

+        __u32 pmc1;

+        __u32 pmc2;

+        __u32 pmc3;

+        __u32 upmc0;

+        __u32 upmc1;

        /*

         * SPRG4-7 are user-readable, so we can only keep these consistent

###################################################

--- a/arch/powerpc/kvm/emulate.c

+++ b/arch/powerpc/kvm/emulate.c

@@ -31,6 +31,7 @@

#include <asm/kvm_ppc.h>

#include <asm/disassemble.h>

#include <asm/ppc-opcode.h>

+#include <asm/reg_fsl_emb.h>

#include "timing.h"

#include "trace.h"

@@ -90,6 +91,74 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)

        return vcpu->arch.dec - jd;

}

+static int kvmppc_emulate_mtpmr(struct kvm_vcpu *vcpu, int pmrn, int rs)

+{

+

+       enum emulation_result emulated = EMULATE_DONE;

+       ulong pmr_val = kvmppc_get_gpr(vcpu, rs);

+

+       switch(pmrn)  {

+       case PMRN_PMC0:

+               vcpu->arch.shared->pmc0 = pmr_val;

+               break;

+       case PMRN_PMC1:

+               vcpu->arch.shared->pmc1 = pmr_val;

+               break;

+       case PMRN_PMC2:

+               vcpu->arch.shared->pmc2 = pmr_val;

+               break;

+       case PMRN_PMC3:

+               vcpu->arch.shared->pmc3 = pmr_val;

+               break;

+       case PMRN_UPMC0:

+               vcpu->arch.shared->upmc0 = pmr_val;

+               break;

+      case PMRN_UPMC1:

+               vcpu->arch.shared->upmc1 = pmr_val;

+               break;

+

+     }

+       kvmppc_set_exit_type(vcpu, EMULATED_MTPMR_EXITS);

+

+       return emulated;

+}

+

+static int kvmppc_emulate_mfpmr(struct kvm_vcpu *vcpu, int pmrn, int rt)

+{

+

+       enum emulation_result emulated = EMULATE_DONE;

+       ulong pmr_val = 0;

+

+       switch(pmrn)  {

+       case PMRN_PMC0:

+                pmr_val = vcpu->arch.shared->pmc0;

+                break;

+       case PMRN_PMC1:

+                pmr_val = vcpu->arch.shared->pmc1;

+                break;

+      case PMRN_PMC2:

+                pmr_val = vcpu->arch.shared->pmc2;

+                break;

+      case PMRN_PMC3:

+                pmr_val = vcpu->arch.shared->pmc3;

+                break;

+      case PMRN_UPMC0:

+                pmr_val = vcpu->arch.shared->upmc0;

+                break;

+      case PMRN_UPMC1:

+                pmr_val = vcpu->arch.shared->upmc1;

+                break;

+

+       }

+       kvmppc_set_gpr(vcpu, rt, pmr_val);

+       kvmppc_set_exit_type(vcpu, EMULATED_MTPMR_EXITS);

+      

+

+       return emulated;

+}

+

+

+

static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)

{

        enum emulation_result emulated = EMULATE_DONE;

@@ -212,7 +281,7 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)

int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)

{

        u32 inst;

-       int rs, rt, sprn;

+       int rs, rt, sprn,pmrn;

        enum emulation_result emulated;

        int advance = 1;

@@ -228,6 +297,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)

        rs = get_rs(inst);

        rt = get_rt(inst);

        sprn = get_sprn(inst);

+       pmrn = get_pmrn(inst);

        switch (get_op(inst)) {

        case OP_TRAP:

@@ -257,10 +327,19 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)

                        advance = 0;

                        break;

-               case OP_31_XOP_MFSPR:

+

+                case OP_31_XOP_MFPMR:

+                        emulated = kvmppc_emulate_mfpmr(vcpu, pmrn, rt);

+                        break;

+              

+                case OP_31_XOP_MFSPR:

                        emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);

                        break;

+                case OP_31_XOP_MTPMR:

+                        emulated = kvmppc_emulate_mtpmr(vcpu, pmrn, rs);

+                        break;

+

                case OP_31_XOP_MTSPR:

                        emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);

                        break;

#################################################################################################

--- a/arch/powerpc/kvm/timing.c

+++ b/arch/powerpc/kvm/timing.c

@@ -121,6 +121,8 @@ static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES

        [EMULATED_INST_EXITS] =     "EMULINST",

        [EMULATED_MTMSRWE_EXITS] =  "EMUL_WAIT",

        [EMULATED_WRTEE_EXITS] =    "EMUL_WRTEE",

+        [EMULATED_MTPMR_EXITS] =    "EMUL_MTPMR",

+        [EMULATED_MFPMR_EXITS] =    "EMUL_MFPMR",

        [EMULATED_MTSPR_EXITS] =    "EMUL_MTSPR",

        [EMULATED_MFSPR_EXITS] =    "EMUL_MFSPR",

        [EMULATED_MTMSR_EXITS] =    "EMUL_MTMSR",

####################################################################################################

--- a/arch/powerpc/include/asm/kvm_host.h

+++ b/arch/powerpc/include/asm/kvm_host.h

@@ -136,6 +136,8 @@ enum kvm_exit_types {

        EMULATED_INST_EXITS,

        EMULATED_MTMSRWE_EXITS,

        EMULATED_WRTEE_EXITS,

+        EMULATED_MTPMR_EXITS,

+        EMULATED_MFPMR_EXITS,

        EMULATED_MTSPR_EXITS,

        EMULATED_MFSPR_EXITS,

        EMULATED_MTMSR_EXITS,

But I'm not sure: is the "perf stat ls" output below (after applying these patches) the expected one?

#perf stat ls

Performance counter stats for 'ls':

          2.712256 task-clock                #    0.744 CPUs utilized

                 1 context-switches          #    0.369 K/sec

                 0 cpu-migrations            #    0.000 K/sec

               241 page-faults               #    0.089 M/sec

                 0 cycles                    #    0.000 GHz

   <not supported> stalled-cycles-frontend

   <not supported> stalled-cycles-backend

                 0 instructions              #    0.00  insns per cycle

                 0 branches                  #    0.000 K/sec

                 0 branch-misses             #    0.00% of all branches

       0.003646108 seconds time elapsed

Thanks,

Amit.

0 Kudos
25 Views
NXP Employee
NXP Employee

You'll need to do more than that to make perfmon work in a KVM guest.  You'll also need to clear MSRP[PMMP] so that the guest can use MSR[PMM], and handle perfmon interrupt reflection.  The latter is tricky because you need to mask the interrupt until the guest has cleared it, but MSRP[PMMP] causes mfpmr/mtpmr to not trap, so the guest will see what you've done to mask the interrupt.  Look at the perfmon code in the Freescale Embedded Hypervisor (sdk/hypervisor/hypervisor.git - Embedded hypervisor for PowerPC) for an example of how to deal with this.

View solution in original post

24 Views
NXP Employee
NXP Employee

Thanks @Scott, I have incorporated your suggestions and can see the output below from "perf stat ls" on the guest.

root@model:~# perf stat ls

perf stat ls

Performance counter stats for 'ls':

          2.781888 task-clock                #    0.678 CPUs utilized

                 0 context-switches          #    0.000 K/sec

                 0 cpu-migrations            #    0.000 K/sec

               241 page-faults               #    0.087 M/sec

           4608872 cycles                    #    1.657 GHz

   <not supported> stalled-cycles-frontend

   <not supported> stalled-cycles-backend

           2335880 instructions              #    0.51  insns per cycle

            367221 branches                  #  132.004 M/sec

             31594 branch-misses             #    8.60% of all branches

       0.004103946 seconds time elapsed

Are you OK with it? Is this the expected output?

Thanks,

Amit

0 Kudos
24 Views
NXP Employee
NXP Employee

Yes, it looks reasonable.

0 Kudos
24 Views
NXP Employee
NXP Employee

Ok, Thanks.

Also, would you please review the changes made as per your suggestion?

--- a/arch/powerpc/kvm/emulate.c

+++ b/arch/powerpc/kvm/emulate.c

@@ -31,6 +31,8 @@

#include <asm/kvm_ppc.h>

#include <asm/disassemble.h>

#include <asm/ppc-opcode.h>

+#include <asm/reg_fsl_emb.h>

+#include <asm/reg_booke.h>

#include "timing.h"

#include "trace.h"

@@ -90,6 +92,88 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)

     return vcpu->arch.dec - jd;

}

+static int kvmppc_emulate_mtpmr(struct kvm_vcpu *vcpu, int pmrn, int rs)

+{

+

+       enum emulation_result emulated = EMULATE_DONE;

+       ulong pmr_val = kvmppc_get_gpr(vcpu, rs);

+

+       switch(pmrn)  {

+       case PMRN_PMC0:

+               vcpu->arch.shared->pmc0 = pmr_val;

+               break;

+       case PMRN_PMC1:

+               vcpu->arch.shared->pmc1 = pmr_val;

+               break;

+       case PMRN_PMC2:

+               vcpu->arch.shared->pmc2 = pmr_val;

+               break;

+       case PMRN_PMC3:

+               vcpu->arch.shared->pmc3 = pmr_val;

+               break;

+       case PMRN_UPMC0:

+               vcpu->arch.shared->upmc0 = pmr_val;

+               break;

+      case PMRN_UPMC1:

+               vcpu->arch.shared->upmc1 = pmr_val;

+               break;

+      case PMRN_PMGC0:

+                if (mfspr(SPRN_MSRP) & MSRP_PMMP) {

+

+                if (!(pmr_val & PMGC0_PMIE))

+                                mtspr(SPRN_MSRP, mfspr(SPRN_MSRP) & ~MSRP_PMMP);

+

+                        pmr_val &= ~PMGC0_PMIE;

+                 }

+                vcpu->arch.shared->pmgc0 = pmr_val;

+                break;

+

+

+     }

+       kvmppc_set_exit_type(vcpu, EMULATED_MTPMR_EXITS);

+

+       return emulated;

+}

+

+static int kvmppc_emulate_mfpmr(struct kvm_vcpu *vcpu, int pmrn, int rt)

+{

+

+       enum emulation_result emulated = EMULATE_DONE;

+       ulong pmr_val = 0;

+

+       switch(pmrn)  {

+       case PMRN_PMC0:

+                pmr_val = vcpu->arch.shared->pmc0;

+                break;

+       case PMRN_PMC1:

+                pmr_val = vcpu->arch.shared->pmc1;

+                break;

+      case PMRN_PMC2:

+                pmr_val = vcpu->arch.shared->pmc2;

+                break;

+      case PMRN_PMC3:

+                pmr_val = vcpu->arch.shared->pmc3;

+                break;

+      case PMRN_UPMC0:

+                pmr_val = vcpu->arch.shared->upmc0;

+                break;

+      case PMRN_UPMC1:

+                pmr_val = vcpu->arch.shared->upmc1;

+                break;

+      case PMRN_PMGC0:

+                pmr_val = vcpu->arch.shared->pmgc0;

+                break;

+

+       }

+       kvmppc_set_exit_type(vcpu, EMULATED_MTPMR_EXITS);

+

+       kvmppc_set_gpr(vcpu, rt, pmr_val);

+

+       return emulated;

+}

+

+

+

static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)

{

     enum emulation_result emulated = EMULATE_DONE;

@@ -212,7 +296,7 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)

int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)

{

     u32 inst;

-    int rs, rt, sprn;

+    int rs, rt, sprn,pmrn;

     enum emulation_result emulated;

     int advance = 1;

@@ -228,6 +312,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)

     rs = get_rs(inst);

     rt = get_rt(inst);

     sprn = get_sprn(inst);

+        pmrn = get_pmrn(inst);

     switch (get_op(inst)) {

     case OP_TRAP:

@@ -257,10 +342,19 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)

             advance = 0;

             break;

-        case OP_31_XOP_MFSPR:

+

+                case OP_31_XOP_MFPMR:

+                        emulated = kvmppc_emulate_mfpmr(vcpu, pmrn, rt);

+                        break;

+      

+                case OP_31_XOP_MFSPR:

             emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);

             break;

+                case OP_31_XOP_MTPMR:

+                        emulated = kvmppc_emulate_mtpmr(vcpu, pmrn, rs);

+                        break;

+

         case OP_31_XOP_MTSPR:

             emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);

             break;

Are these changes fine?

Thanks

Amit.

0 Kudos
24 Views
NXP Employee
NXP Employee

These changes are not as per my suggestion.  As I pointed out in e-mail, they don't handle interrupts at all.

0 Kudos
24 Views
NXP Employee
NXP Employee

OK @Scott, thanks for the review comments; I will work on it.

Also,would you please confirm to handle interrupts need to take care pending interrupts for perfmon?

Thanks

Amit.

0 Kudos
24 Views
NXP Employee
NXP Employee

I can't parse that sentence.  What exactly do you want me to confirm?  When a perfmon interrupt comes in, you need to mask it, and reflect it to the guest, while hiding from the guest the fact that it has been masked.  Again, see Topaz for an example.

0 Kudos
24 Views
NXP Employee
NXP Employee

Sorry, I could not implement your suggested changes, but I looked into the Topaz and KVM sources a lot more closely, and below are my understanding and queries.

1. Basically, we need to share the perfmon interrupt between host and guest. When an actual interrupt arrives, we would mask it by writing zero to PMGC0_PMIE (is that because we don't want to handle further interrupts?) and disable guest access to the PMRs by writing one to MSRP_PMMP.

But if the above is true, where should I make these changes? Is it good to place them in "static void kvmppc_restart_interrupt" in "booke.c", under "case BOOKE_INTERRUPT_PERFORMANCE_MONITOR:"?

2. Earlier I placed kvmppc_emulate_mtpmr inside emulate.c; shall I move it to e500_emulate.c?

3. Also, I still can't think of a way of using kvm_guest_protect_msr for manipulating MSRP (I understand that we have to do it since e500v2 doesn't have Embedded Hypervisor support).

Seeking your guidance, please!

Thanks,

Amit.

0 Kudos
24 Views
NXP Employee
NXP Employee

Thanks, Scott, for your help on this; I will indeed take care of your comments.

0 Kudos