Performance monitoring tool.

Squashed commit of the following:

commit f1820af82bb5467d0c79c03290fca809b0273030
Author: philip <philip@gladstonefamily.net>
Date:   Sun Feb 21 15:08:31 2016 -0500

    Now uses userdata

commit 74a2298f5f2d2b07097a9501046efb8d4061ec5e
Merge: 4ffab15 716e682
Author: philip <philip@gladstonefamily.net>
Date:   Sun Feb 21 13:54:40 2016 -0500

    Merge remote-tracking branch 'upstream/dev' into performance

    Conflicts:
    	app/platform/hw_timer.c
    	app/platform/hw_timer.h

commit 4ffab15a2a15e0c6b2d7e93611a02be47bafdc79
Author: philip <philip@gladstonefamily.net>
Date:   Fri Feb 12 17:36:12 2016 -0500

    Simple low level performance monitoring tool

    Make it work with the new hw_timer code

commit 944db2bdb8
Author: philip <philip@gladstonefamily.net>
Date:   Sun Feb 14 10:32:41 2016 -0500

    Initial version of the hw_timer as part of the platform

Addressed review comments

Add the binsize return
This commit is contained in:
philip 2016-02-21 15:12:13 -05:00
parent 716e6824db
commit e516a0e9a2
4 changed files with 215 additions and 0 deletions

View File

@ -32,6 +32,7 @@
#define LUA_USE_MODULES_NET
#define LUA_USE_MODULES_NODE
#define LUA_USE_MODULES_OW
//#define LUA_USE_MODULES_PERF
#define LUA_USE_MODULES_PWM
#define LUA_USE_MODULES_RC
#define LUA_USE_MODULES_RTCFIFO

147
app/modules/perf.c Normal file
View File

@ -0,0 +1,147 @@
//
// This module allows performance monitoring by looking at
// the PC at regular intervals and building a histogram
//
// perf.start([start[, end[, nbins[, pc offset on stack]]]])
// perf.stop() -> total samples, samples outside range, table { addr -> count , .. }, bin size
#include "ets_sys.h"
#include "os_type.h"
#include "osapi.h"
#include "c_stdlib.h"
#include "module.h"
#include "lauxlib.h"
#include "platform.h"
#include "hw_timer.h"
#include "cpu_esp8266.h"
// State of one profiling session. It is allocated as a Lua userdata with
// the histogram buckets stored directly after the fixed header.
typedef struct {
  int ref;                  // Lua registry reference keeping this userdata alive
  uint32_t start;           // lowest address covered by the histogram
  uint32_t bucket_shift;    // log2 of the bucket size in bytes
  uint32_t bucket_count;    // number of entries in bucket[]
  uint32_t total_samples;   // every sample taken, inside or outside the range
  uint32_t outside_samples; // samples whose address fell outside the histogram
  uint32_t pc_offset;       // offset, in 32-bit words, from a local in the timer
                            // callback to the sampled PC value on the stack
  uint32_t bucket[];        // flexible array member: bucket_count counters
} DATA;
// The active profiling session, or NULL when no session is running.
// Written by perf_start/perf_stop and read by the timer callback.
static DATA *data;
// Symbol defined outside this file; its address is used as the default
// upper bound of the profiled address range.
extern char _flash_used_end[];
// Owner tag passed to every platform_hw_timer_* call made by this module.
#define TIMER_OWNER ((os_param_t) 'p')
// Timer callback: takes one PC sample and records it in the histogram.
//
// The sample is the 32-bit word found data->pc_offset words past the address
// of a local variable; the module documentation describes pc_offset as the
// offset of the saved PC value on the interrupt stack. Because the lookup is
// relative to this function's own stack frame, changing the locals here may
// change the offset that is required.
//
// p is the parameter registered with the timer; it is not used.
static void ICACHE_RAM_ATTR hw_timer_cb(os_param_t p)
{
(void) p;
// Never assigned: only its address is used, as an anchor into the stack.
uint32_t stackaddr;
if (data) {
uint32_t pc = *(&stackaddr + data->pc_offset);
// Unsigned arithmetic: a pc below data->start wraps to a large value, which
// normally fails the range test below and is counted as outside.
uint32_t bucket_number = (pc - data->start) >> data->bucket_shift;
if (bucket_number < data->bucket_count) {
data->bucket[bucket_number]++;
} else {
data->outside_samples++;
}
// Counted whether or not the sample landed in a bucket.
data->total_samples++;
}
}
// Lua: perf.start([start[, end[, nbins[, offset]]]])
//
// Starts a profiling session covering the addresses [start, end).
//   start  - lowest address of the histogram (default 0x40000000)
//   end    - upper bound of the histogram (default: address of _flash_used_end)
//   nbins  - upper limit on the number of buckets (default 1024)
//   offset - word offset of the saved PC as seen from the timer callback
//            (default 20)
//
// The bucket size is the smallest power of two for which the range fits in
// at most nbins buckets. Any previous session's buffer is released.
// Returns nothing to Lua. Raises a Lua error if the arguments are invalid or
// the hardware timer cannot be initialized.
static int perf_start(lua_State *L)
{
  uint32_t start = luaL_optinteger(L, 1, 0x40000000);
  uint32_t end = luaL_optinteger(L, 2, (uint32_t) _flash_used_end);
  lua_Integer requested_bins = luaL_optinteger(L, 3, 1024);

  if (end <= start) {
    luaL_error(L, "end must be larger than start");
  }
  // Zero would divide by zero below; a negative value would wrap to a huge
  // unsigned bucket count.
  if (requested_bins < 1) {
    luaL_error(L, "number of bins must be at least 1");
  }

  uint32_t range = end - start;
  uint32_t max_bins = (uint32_t) requested_bins;

  // ceil(range / max_bins), written so that the intermediate cannot overflow.
  uint32_t binsize = range / max_bins + (range % max_bins != 0);

  // Round up to a power of two: shift is the bit length of (binsize - 1).
  uint32_t shift = 0;
  for (uint32_t rest = binsize - 1; rest > 0; rest >>= 1) {
    shift++;
  }
  // A 32-bit shift is undefined and the bucket size would not fit in 32 bits.
  if (shift > 31) {
    luaL_error(L, "address range too large for this number of bins");
  }

  uint32_t bucket_size = (uint32_t) 1 << shift;
  // ceil(range / bucket_size); never more than max_bins.
  uint32_t bins = (range >> shift) + ((range & (bucket_size - 1)) != 0);

  int pc_offset = 20; // This appears to be correct
  if (lua_gettop(L) >= 4) {
    pc_offset = luaL_checkinteger(L, 4);
  }

  // Refuse bucket counts whose byte size would wrap size_t and yield an
  // undersized buffer.
  if (bins > (((size_t) -1) - sizeof(DATA)) / sizeof(uint32_t)) {
    luaL_error(L, "too many bins");
  }

  size_t data_size = sizeof(DATA) + bins * sizeof(uint32_t);
  DATA *d = (DATA *) lua_newuserdata(L, data_size);
  memset(d, 0, data_size);
  // luaL_ref pops the userdata and anchors it in the registry so that it
  // outlives this call.
  d->ref = luaL_ref(L, LUA_REGISTRYINDEX);
  d->start = start;
  d->bucket_shift = shift;
  d->bucket_count = bins;
  d->pc_offset = pc_offset;

  // Drop the registry reference of any previous session before replacing it.
  if (data) {
    lua_unref(L, data->ref);
  }
  data = d;

  // Start the timer
  if (!platform_hw_timer_init(TIMER_OWNER, NMI_SOURCE, TRUE)) {
    // Failed to init the timer
    data = NULL;
    lua_unref(L, d->ref);
    luaL_error(L, "Unable to initialize timer");
  }

  platform_hw_timer_set_func(TIMER_OWNER, hw_timer_cb, 0);
  platform_hw_timer_arm_us(TIMER_OWNER, 50);
  return 0;
}
// Lua: total, outside, histogram, binsize = perf.stop()
//
// Ends the current profiling session and returns its results:
//   total     - number of samples taken
//   outside   - number of samples that fell outside the histogram range
//   histogram - table mapping the lowest address of each non-empty bucket
//               to its sample count
//   binsize   - size of one bucket in bytes
// Returns nothing if no session is active.
static int perf_stop(lua_State *L)
{
  if (!data) {
    return 0;
  }

  // stop the timer
  platform_hw_timer_close(TIMER_OWNER);

  DATA *d = data;
  data = NULL;

  // Hand the buffer over from the registry to the Lua stack before doing
  // anything that can raise an error: the stack slot keeps it alive while the
  // results are built, and if table construction fails the userdata is simply
  // garbage collected instead of being leaked behind a stale registry entry.
  lua_rawgeti(L, LUA_REGISTRYINDEX, d->ref);
  lua_unref(L, d->ref);

  // Unsigned so that a shift of 31 is well defined.
  const uint32_t bucket_size = (uint32_t) 1 << d->bucket_shift;

  lua_pushnumber(L, d->total_samples);
  lua_pushnumber(L, d->outside_samples);
  lua_newtable(L);

  // Only non-empty buckets are reported, keyed by their lowest address.
  uint32_t addr = d->start;
  for (uint32_t i = 0; i < d->bucket_count; i++, addr += bucket_size) {
    if (d->bucket[i]) {
      lua_pushnumber(L, addr);
      lua_pushnumber(L, d->bucket[i]);
      lua_settable(L, -3);
    }
  }

  lua_pushnumber(L, bucket_size);

  // The top four stack values are returned; the anchored userdata below them
  // is discarded by Lua when this function returns.
  return 4;
}
// Functions exposed to Lua: perf.start and perf.stop.
static const LUA_REG_TYPE perf_map[] = {
{ LSTRKEY( "start" ), LFUNCVAL( perf_start ) },
{ LSTRKEY( "stop" ), LFUNCVAL( perf_stop ) },
{ LNILKEY, LNILVAL }
};
// Registers the table above under the Lua name "perf".
// NOTE(review): the trailing NULL is presumably "no init function" - confirm against module.h.
NODEMCU_MODULE(PERF, "perf", perf_map, NULL);

66
docs/en/modules/perf.md Normal file
View File

@ -0,0 +1,66 @@
# perf Module
This module provides simple performance measurement for an application.
It samples the program counter roughly every 50 microseconds and builds
a histogram of the values that it finds. Since there is only a small amount
of memory to store the histogram, the user can specify which area of code
is of interest. The default is the entire flash which contains code. Once the hotspots are
identified, the run can be repeated with different areas and at different
resolutions to get as much information as required.
## perf.start()
Starts a performance monitoring session.
#### Syntax
`perf.start([start[, end[, nbins[, offset]]]])`
#### Parameters
- `start` (optional) The lowest PC address for the histogram. Default is 0x40000000.
- `end` (optional) The highest address for the histogram. Default is the end of the used space in the flash memory.
- `nbins` (optional) The number of bins in the histogram. Keep this reasonable otherwise
you will run out of memory. Default is 1024.
- `offset` (Very optional) This specifies the offset of the saved PC value
on the interrupt stack. It appears that 20 is the correct value.
Note that the number of bins is an upper limit. The size of each bin is set to be the smallest power of two
such that the number of bins required is less than or equal to the provided number of bins.
#### Returns
Nothing
## perf.stop()
Terminates a performance monitoring session and returns the histogram.
#### Syntax
`total, outside, histogram, binsize = perf.stop()`
#### Returns
- `total` The total number of samples captured in this run
- `outside` The number of samples that were outside the histogram range
- `histogram` The histogram represented as a table indexed by address where the value is the number of samples. The address is the lowest address for the bin.
- `binsize` The number of bytes per histogram bin.
### Example
perf.start()
for j = 0, 100 do
str = "str"..j
end
tot, out, tbl, binsize = perf.stop()
print(tot, out)
local keyset = {}
local n = 0
for k,v in pairs(tbl) do
n=n+1
keyset[n]=k
end
table.sort(keyset)
for kk,k in ipairs(keyset) do print(string.format("%x - %x",k, k + binsize - 1),tbl[k]) end
This runs a loop creating strings 100 times and then prints out the histogram (after sorting it).
This takes around 2,500 samples and provides a good indication of where all the CPU time is
being spent.

View File

@ -52,6 +52,7 @@ pages:
- 'net': 'en/modules/net.md'
- 'node': 'en/modules/node.md'
- 'ow (1-Wire)': 'en/modules/ow.md'
- 'perf': 'en/modules/perf.md'
- 'pwm' : 'en/modules/pwm.md'
- 'rtcmem': 'en/modules/rtcmem.md'
- 'rtctime': 'en/modules/rtctime.md'