Merge pull request #1041 from pjsg/performance
Simple low level performance monitoring tool as a module
This commit is contained in:
commit
19d3c1d581
|
@ -35,6 +35,7 @@
|
|||
#define LUA_USE_MODULES_NET
|
||||
#define LUA_USE_MODULES_NODE
|
||||
#define LUA_USE_MODULES_OW
|
||||
//#define LUA_USE_MODULES_PERF
|
||||
#define LUA_USE_MODULES_PWM
|
||||
#define LUA_USE_MODULES_RC
|
||||
#define LUA_USE_MODULES_RTCFIFO
|
||||
|
|
|
@ -0,0 +1,147 @@
|
|||
//
|
||||
// This module allows performance monitoring by looking at
|
||||
// the PC at regular intervals and building a histogram
|
||||
//
|
||||
// perf.start([start[, end[, nbins[, pc offset on stack]]]])
|
||||
// perf.stop() -> total samples, samples outside range, table { addr -> count, ... }, bin size
|
||||
|
||||
|
||||
#include "ets_sys.h"
|
||||
#include "os_type.h"
|
||||
#include "osapi.h"
|
||||
#include "c_stdlib.h"
|
||||
|
||||
#include "module.h"
|
||||
#include "lauxlib.h"
|
||||
#include "platform.h"
|
||||
#include "hw_timer.h"
|
||||
#include "cpu_esp8266.h"
|
||||
|
||||
// State of one sampling session. It lives in a Lua userdata block that is
// allocated with room for the whole histogram after the fixed fields.
typedef struct {
  int ref;                   // registry reference (from luaL_ref) that keeps the userdata alive
  uint32_t start;            // lowest address covered by the histogram
  uint32_t bucket_shift;     // log2 of the bucket size in bytes
  uint32_t bucket_count;     // number of usable entries in bucket[]
  uint32_t total_samples;    // incremented on every timer callback
  uint32_t outside_samples;  // samples whose PC did not map to a bucket
  uint32_t pc_offset;        // distance, in 32-bit words, from a local in the
                             // timer callback to the stack slot read as the PC
  uint32_t bucket[1];        // first histogram entry; the allocation extends this array
} DATA;
|
||||
|
||||
// The active sampling session, or NULL when no session is running.
// Written by perf_start()/perf_stop() and read by the timer callback.
static DATA *data;

// Symbol used as the default upper bound of the sampled address range.
extern char _flash_used_end[];

// Tag passed to the platform_hw_timer_* calls to identify this module.
#define TIMER_OWNER ((os_param_t) 'p')
|
||||
|
||||
// Timer callback: takes one PC sample and accounts for it in the active
// session's histogram. Does nothing when no session is active.
//
// NOTE: the sampled value is located relative to the address of the local
// variable `stackaddr`, so the statements below are intentionally left
// exactly as they were; restructuring this function may change its stack
// frame and invalidate the configured pc_offset.
static void ICACHE_RAM_ATTR hw_timer_cb(os_param_t p)
{
  (void) p;
  uint32_t stackaddr;

  if (data) {
    // Read the word pc_offset 32-bit slots above `stackaddr`; this is
    // treated as the program counter that was interrupted.
    uint32_t pc = *(&stackaddr + data->pc_offset);

    // Unsigned arithmetic: a pc below data->start wraps to a large value
    // and therefore falls into the "outside" branch as well.
    uint32_t bucket_number = (pc - data->start) >> data->bucket_shift;
    if (bucket_number < data->bucket_count) {
      data->bucket[bucket_number]++;
    } else {
      data->outside_samples++;
    }
    data->total_samples++;
  }
}
|
||||
|
||||
// perf.start([start[, end[, nbins[, offset]]]])
//
// Begins a sampling session over the address range [start, end).
//   start  - lowest address of the histogram (default 0x40000000)
//   end    - upper bound of the histogram (default _flash_used_end)
//   nbins  - maximum number of histogram bins (default 1024), must be > 0
//   offset - stack slot offset used by the timer callback to find the PC
//            (default 20)
//
// The bin size is the smallest power of two for which the range fits in at
// most nbins bins. Any session that is already active is discarded.
// Returns nothing to Lua; raises a Lua error on invalid arguments or when
// the hardware timer cannot be initialised.
static int perf_start(lua_State *L)
{
  uint32_t start = luaL_optinteger(L, 1, 0x40000000);
  uint32_t end = luaL_optinteger(L, 2, (uint32_t) _flash_used_end);
  lua_Integer requested_bins = luaL_optinteger(L, 3, 1024);
  int pc_offset = luaL_optinteger(L, 4, 20); // 20 appears to be correct

  if (end <= start) {
    return luaL_error(L, "end must be larger than start");
  }
  // A bin count of zero would divide by zero and a negative one would wrap
  // to a huge unsigned value, so reject both up front.
  luaL_argcheck(L, requested_bins > 0, 3, "nbins must be positive");

  uint32_t max_bins = (uint32_t) requested_bins;

  // Offset of the last byte in the range. Working with this instead of
  // (end - start + bins - 1) avoids 32-bit overflow for large ranges.
  uint32_t last = end - start - 1;

  // Find the smallest shift such that the range needs at most max_bins
  // buckets of (1 << shift) bytes each. The shift is capped at 31 so the
  // bucket size always fits in 32 bits; in that extreme corner (a range
  // above 2GB with nbins == 1) two buckets are used.
  int shift = 0;
  while (shift < 31 && (last >> shift) >= max_bins) {
    shift++;
  }

  // Number of buckets actually required at this bucket size.
  uint32_t bins = (last >> shift) + 1;

  // Guard the size computation below against wrapping, which would
  // otherwise yield a buffer smaller than bucket_count entries.
  if (bins > (((size_t) -1) - sizeof(DATA)) / sizeof(uint32_t)) {
    return luaL_error(L, "too many bins");
  }

  size_t data_size = sizeof(DATA) + bins * sizeof(uint32_t);
  DATA *d = (DATA *) lua_newuserdata(L, data_size);
  memset(d, 0, data_size);
  // Anchor the userdata in the registry so it is not collected while the
  // timer callback is using it.
  d->ref = luaL_ref(L, LUA_REGISTRYINDEX);
  d->start = start;
  d->bucket_shift = shift;
  d->bucket_count = bins;
  d->pc_offset = pc_offset;

  // Drop the registry reference of any session that is still active.
  if (data) {
    lua_unref(L, data->ref);
  }

  data = d;

  // Start the timer
  if (!platform_hw_timer_init(TIMER_OWNER, NMI_SOURCE, TRUE)) {
    // Failed to init the timer
    data = NULL;
    lua_unref(L, d->ref);
    return luaL_error(L, "Unable to initialize timer");
  }

  platform_hw_timer_set_func(TIMER_OWNER, hw_timer_cb, 0);
  platform_hw_timer_arm_us(TIMER_OWNER, 50);

  return 0;
}
|
||||
|
||||
// perf.stop() -> total, outside, histogram, binsize
//
// Ends the active sampling session and returns its results:
//   total     - number of samples taken
//   outside   - number of samples that fell outside the histogram range
//   histogram - table mapping the lowest address of each non-empty bin to
//               its sample count
//   binsize   - number of bytes covered by each bin
// Returns nothing when no session is active.
static int perf_stop(lua_State *L)
{
  if (!data) {
    return 0;
  }

  // stop the timer
  platform_hw_timer_close(TIMER_OWNER);

  DATA *d = data;
  data = NULL;

  // Computed as an unsigned value: a signed `1 << 31` overflows int and
  // would report a negative bin size.
  const uint32_t bin_size = (uint32_t) 1 << d->bucket_shift;

  lua_pushnumber(L, d->total_samples);
  lua_pushnumber(L, d->outside_samples);
  lua_newtable(L);

  // Only non-empty bins are stored, keyed by the bin's lowest address.
  uint32_t i;
  uint32_t addr = d->start;
  for (i = 0; i < d->bucket_count; i++, addr += bin_size) {
    if (d->bucket[i]) {
      lua_pushnumber(L, addr);
      lua_pushnumber(L, d->bucket[i]);
      lua_settable(L, -3);
    }
  }

  lua_pushnumber(L, bin_size);

  // Release the registry reference taken in perf_start(); the buffer is no
  // longer read after this point.
  lua_unref(L, d->ref);

  return 4;
}
|
||||
|
||||
static const LUA_REG_TYPE perf_map[] = {
|
||||
{ LSTRKEY( "start" ), LFUNCVAL( perf_start ) },
|
||||
{ LSTRKEY( "stop" ), LFUNCVAL( perf_stop ) },
|
||||
{ LNILKEY, LNILVAL }
|
||||
};
|
||||
|
||||
NODEMCU_MODULE(PERF, "perf", perf_map, NULL);
|
|
@ -0,0 +1,66 @@
|
|||
# perf Module
|
||||
|
||||
This module provides simple performance measurement for an application.
|
||||
It samples the program counter roughly every 50 microseconds and builds
|
||||
a histogram of the values that it finds. Since there is only a small amount
|
||||
of memory to store the histogram, the user can specify which area of code
|
||||
is of interest. The default is the entire flash which contains code. Once the hotspots are
|
||||
identified, the run can be repeated with different areas and at different
|
||||
resolutions to get as much information as required.
|
||||
|
||||
## perf.start()
|
||||
Starts a performance monitoring session.
|
||||
|
||||
#### Syntax
|
||||
`perf.start([start[, end[, nbins[, offset]]]])`
|
||||
|
||||
#### Parameters
|
||||
- `start` (optional) The lowest PC address for the histogram. Default is 0x40000000.
|
||||
- `end` (optional) The highest address for the histogram. Default is the end of the used space in the flash memory.
|
||||
- `nbins` (optional) The number of bins in the histogram. Keep this reasonable otherwise
|
||||
you will run out of memory. Default is 1024.
|
||||
- `offset` (Very optional) This specifies the offset of the saved PC value
|
||||
on the interrupt stack. It appears that 20 is the correct value.
|
||||
|
||||
Note that the number of bins is an upper limit. The size of each bin is set to be the smallest power of two
|
||||
such that the number of bins required is less than or equal to the provided number of bins.
|
||||
|
||||
#### Returns
|
||||
Nothing
|
||||
|
||||
## perf.stop()
|
||||
|
||||
Terminates a performance monitoring session and returns the histogram.
|
||||
|
||||
#### Syntax
|
||||
`total, outside, histogram, binsize = perf.stop()`
|
||||
|
||||
#### Returns
|
||||
- `total` The total number of samples captured in this run
|
||||
- `outside` The number of samples that were outside the histogram range
|
||||
- `histogram` The histogram represented as a table indexed by address where the value is the number of samples. The address is the lowest address for the bin.
|
||||
- `binsize` The number of bytes per histogram bin.
|
||||
|
||||
### Example
|
||||
|
||||
perf.start()
|
||||
|
||||
for j = 0, 100 do
|
||||
str = "str"..j
|
||||
end
|
||||
|
||||
tot, out, tbl, binsize = perf.stop()
|
||||
|
||||
print(tot, out)
|
||||
local keyset = {}
|
||||
local n = 0
|
||||
for k,v in pairs(tbl) do
|
||||
n=n+1
|
||||
keyset[n]=k
|
||||
end
|
||||
table.sort(keyset)
|
||||
for kk,k in ipairs(keyset) do print(string.format("%x - %x",k, k + binsize - 1),tbl[k]) end
|
||||
|
||||
This runs a loop that creates 101 strings and then prints out the histogram (after sorting it).
|
||||
This takes around 2,500 samples and provides a good indication of where all the CPU time is
|
||||
being spent.
|
|
@ -55,6 +55,7 @@ pages:
|
|||
- 'net': 'en/modules/net.md'
|
||||
- 'node': 'en/modules/node.md'
|
||||
- 'ow (1-Wire)': 'en/modules/ow.md'
|
||||
- 'perf': 'en/modules/perf.md'
|
||||
- 'pwm' : 'en/modules/pwm.md'
|
||||
- 'rtcmem': 'en/modules/rtcmem.md'
|
||||
- 'rtctime': 'en/modules/rtctime.md'
|
||||
|
|
Loading…
Reference in New Issue