Add compression to LFS images (#2448)

* Merge of LFS compress, optimize against current dev * Fixes to LFS compress patch
2018-09-29 16:57:51 +03:00 · 2018-09-29 16:57:51 +03:00 · 172fb276ca
parent 0e103a39dd
commit 172fb276ca
26 changed files with 2363 additions and 245 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,10 +1,13 @@
 .gdb_history
 sdk/
 cache/
 .ccache/
 local/
 luac.cross
 user_config.h
 server-ca.crt
 luac.cross
 uz_unzip
 uz_zip
 #ignore Eclipse project files
 .cproject
--- a/app/Makefile
+++ b/app/Makefile
@ -45,6 +45,7 @@ SUBDIRS= 					\
 	fatfs					\
 	esp-gdbstub				\
 	pm					\
 	uzlib					\
 	$(OPT_SEL_MKTARGETS)
 endif # } PDIR
@ -76,6 +77,8 @@ COMPONENTS_eagle.app.v6 = 			\
 	net/libnodemcu_net.a			\
 	mbedtls/libmbedtls.a      		\
 	modules/libmodules.a			\
 	smart/smart.a 				\
 	uzlib/libuzlib.a 			\
 	$(OPT_SEL_COMPONENTS)
--- a/app/lua/Makefile
+++ b/app/lua/Makefile
@ -46,6 +46,7 @@ INCLUDES += -I ../spiffs
 INCLUDES += -I ../libc
 INCLUDES += -I ../modules
 INCLUDES += -I ../platform
 INCLUDES += -I ../uzlib
 PDIR := ../$(PDIR)
 sinclude $(PDIR)Makefile
--- a/app/lua/lflash.c
+++ b/app/lua/lflash.c
@ -16,6 +16,7 @@
 #include "lflash.h"
 #include "platform.h"
 #include "vfs.h"
 #include "uzlib.h"
 #include "c_fcntl.h"
 #include "c_stdio.h"
@ -34,15 +35,52 @@ static uint32_t flashAddrPhys;
 static uint32_t flashSector;
 static uint32_t curOffset;
-#define ALIGN(s)     (((s)+sizeof(size_t)-1) & ((size_t) (- (signed) sizeof(size_t))))
+#define ALIGN(s)      (((s)+sizeof(size_t)-1) & ((size_t) (- (signed) sizeof(size_t))))
 #define ALIGN_BITS(s) (((uint32_t)s) & (sizeof(size_t)-1))
-#define ALL_SET      cast(uint32_t, -1)
+#define ALL_SET       (~0)
-#define FLASH_SIZE   LUA_FLASH_STORE
+#define FLASH_SIZE    LUA_FLASH_STORE
 #define FLASH_PAGE_SIZE INTERNAL_FLASH_SECTOR_SIZE
-#define FLASH_PAGES  (FLASH_SIZE/FLASH_PAGE_SIZE)
+#define FLASH_PAGES   (FLASH_SIZE/FLASH_PAGE_SIZE)
 #define READ_BLOCKSIZE      1024
 #define WRITE_BLOCKSIZE     2048
 #define DICTIONARY_WINDOW  16384
 #define WORDSIZE           (sizeof(int))
 #define BITS_PER_WORD         32
 #define WRITE_BLOCKS       ((DICTIONARY_WINDOW/WRITE_BLOCKSIZE)+1)
 #define WRITE_BLOCK_WORDS  (WRITE_BLOCKSIZE/WORDSIZE)
 char flash_region_base[FLASH_SIZE] ICACHE_FLASH_RESERVED_ATTR;
 struct INPUT {
  int      fd;
  int      len;
  uint8_t  block[READ_BLOCKSIZE];
  uint8_t *inPtr;
  int      bytesRead;
  int      left;
  void    *inflate_state;
 } *in;
 typedef struct {
  uint8_t byte[WRITE_BLOCKSIZE];
 } outBlock;
 struct OUTPUT {
  lua_State *L;
  lu_int32  flash_sig;
  int       len;
  outBlock *block[WRITE_BLOCKS];
  outBlock  buffer;
  int       ndx;
  uint32_t  crc;
  int     (*fullBlkCB) (void);
  int       flashLen;
  int       flagsLen;
  int       flagsNdx;
  uint32_t *flags;
  const char *error;
 } *out;
 #ifdef NODE_DEBUG
 extern void dbg_printf(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
 void dumpStrt(stringtable *tb, const char *type) {
@ -104,25 +142,34 @@ static void flashErase(uint32_t start, uint32_t end){
    platform_flash_erase_sector( flashSector + i );
 }
 /* =====================================================================================
 * luaN_init(), luaN_reload_reboot() and luaN_index() are exported via lflash.h.
 * The first is the startup hook used in lstate.c and the last two are
 * implementations of the node.flash API calls.
 */
 /*
 * Hook in lstate.c:f_luaopen() to set up ROstrt and ROpvmain if needed
 */
 LUAI_FUNC void luaN_init (lua_State *L) {
-//  luaL_dbgbreak();
+  curOffset       = 0;
-  curOffset      = 0;
+  flashAddr       = flash_region_base;
-  flashAddr     = flash_region_base;
+  flashAddrPhys   = platform_flash_mapped2phys((uint32_t)flashAddr);
-  flashAddrPhys = platform_flash_mapped2phys((uint32_t)flashAddr);
+  flashSector     = platform_flash_get_sector_of_address(flashAddrPhys);
  flashSector   = platform_flash_get_sector_of_address(flashAddrPhys);
  FlashHeader *fh = cast(FlashHeader *, flashAddr);
  /*
-   * For the LFS to be valid, its signature has to be correct for this build variant,
+   * For the LFS to be valid, its signature has to be correct for this build
-   * thr ROhash and main proto fields must be defined and the main proto address 
+   * variant, the ROhash and main proto fields must be defined and the main proto
-   * be within the LFS address bounds. (This last check is primarily to detect the
+   * address be within the LFS address bounds. (This last check is primarily to
-   * direct imaging of an absolute LFS with the wrong base address. 
+   * detect the direct imaging of an absolute LFS with the wrong base address.
   */
  if (fh->flash_sig == 0 || fh->flash_sig == ~0 ) {
    NODE_ERR("No LFS image loaded\n");
    return;
  }
  if ((fh->flash_sig & (~FLASH_SIG_ABSOLUTE)) != FLASH_SIG ) {
    NODE_ERR("Flash sig not correct: %p vs %p\n",
       fh->flash_sig & (~FLASH_SIG_ABSOLUTE), FLASH_SIG);
@ -142,94 +189,76 @@ LUAI_FUNC void luaN_init (lua_State *L) {
  G(L)->ROpvmain    = cast(Proto *,fh->mainProto);
 }
-#define BYTE_OFFSET(t,f) cast(size_t, &(cast(t *, NULL)->f))
+//extern void software_reset(void);
-/*
+static int loadLFS (lua_State *L);
- * Rehook address chain to correct Flash byte addressed within the mapped adress space 
+static int loadLFSgc (lua_State *L);
- * Note that on input each 32-bit address field is split into 2×16-bit subfields
+static int procFirstPass (void);
 *  -  the lu_int16 offset of the target address being referenced
 *  -  the lu_int16 offset of the next address pointer. 
 */
 static int rebuild_core (int fd, uint32_t size, lu_int32 *buf, int is_absolute) {
  int bi;  /* byte offset into memory mapped LFS of current buffer */ 
  int wNextOffset = BYTE_OFFSET(FlashHeader,mainProto)/sizeof(lu_int32);
  int wj;  /* word offset into current input buffer */
  for (bi = 0; bi < size; bi += FLASH_PAGE_SIZE) {
    int wi   = bi / sizeof(lu_int32);
    int blen = ((bi + FLASH_PAGE_SIZE) < size) ? FLASH_PAGE_SIZE : size - bi;
    int wlen = blen / sizeof(lu_int32);
    if (vfs_read(fd, buf , blen) != blen)
      return 0;
    if (!is_absolute) {
      for (wj = 0; wj < wlen; wj++) {
        if ((wi + wj) == wNextOffset) {  /* this word is the next linked address */
          int wTargetOffset = buf[wj]&0xFFFF;
          wNextOffset = buf[wj]>>16;
          lua_assert(!wNextOffset || (wNextOffset>(wi+wj) && wNextOffset<size/sizeof(lu_int32)));
          buf[wj] = cast(lu_int32, flashAddr + wTargetOffset*sizeof(lu_int32));
        }
      }
    }
    flashBlock(buf, blen);
  }
  return size;
 }
 /*
- * Library function called by node.flash.load(filename).
+ * Library function called by node.flashreload(filename).
 */
 LUALIB_API int luaN_reload_reboot (lua_State *L) {
-  int fd, status, is_absolute;
+  // luaL_dbgbreak();
-  FlashHeader fh;
+  const char *fn = lua_tostring(L, 1), *msg = "";
  int status;
 /*
  * Do a protected call of loadLFS.
  *
  * -  This will normally rewrite the LFS and reboot, with no return.
  * -  If an error occurs then it is sent to the UART.
  * -  If this occurred in the 1st pass, the previous LFS is unchanged so it is
  *    safe to return to the calling Lua.
  * -  If in the 2nd pass, then the ESP is rebooted.
  */
  status = lua_cpcall(L, &loadLFS, cast(void *,fn));
-  const char *fn = lua_tostring(L, 1);
+  if (!out || out->fullBlkCB == procFirstPass) {
-  if (!fn || !(fd = vfs_open(fn, "r")))
+   /*
-    return 0;
+    * Never entered the 2nd pass, so it is safe to return the error.  Note
    * that I've gone to some trouble to ensure that all dynamically allocated
    * working areas have been freed, so that we have no memory leaks.
    */
    if (status == LUA_ERRMEM)
      msg = "Memory allocation error";
    else if (out && out->error)
      msg = out->error;
    else
      msg = "Unknown Error";
-  if (vfs_read(fd, &fh, sizeof(fh)) != sizeof(fh) ||
+   /* We can clean up and return error */
-      (fh.flash_sig & (~FLASH_SIG_ABSOLUTE)) != FLASH_SIG)
+    lua_cpcall(L, &loadLFSgc, NULL);
-    return 0;
+    lua_settop(L, 0);
    lua_pushstring(L, msg);
    return 1;
  }
-  if (vfs_lseek(fd, -1, VFS_SEEK_END) != fh.flash_size-1 ||
+  if (status == 0) {
-      vfs_lseek(fd, 0, VFS_SEEK_SET) != 0)
+    /* Successful LFS rewrite */
-    return 0;
+    msg = "LFS region updated.  Restarting.";
  } else {
    /* We have errored during the second pass so clear the LFS and reboot */
    if (status == LUA_ERRMEM)
      msg = "Memory allocation error";
    else if (out->error)
      msg = out->error;
    else
      msg = "Unknown Error";
  is_absolute = fh.flash_sig & FLASH_SIG_ABSOLUTE;
  lu_int32 *buffer = luaM_newvector(L, FLASH_PAGE_SIZE / sizeof(lu_int32), lu_int32);
  /*
   * This is the point of no return.  We attempt to rebuild the flash.  If there
   * are any problems them the Flash is going to be corrupt, so the only fallback
   * is to erase it and reboot with a clean but blank flash.  Otherwise the reboot
   * will load the new LFS.
   *
   * Note that the Lua state is not passed into the lua core because from this 
   * point on, we make no calls on the Lua RTS.
   */
  flashErase(0,-1); 
  if (rebuild_core(fd, fh.flash_size, buffer, is_absolute) != fh.flash_size)
    flashErase(0,-1);
-  /*
+  }
-   * Issue a break 0,0.  This will either enter the debugger or force a restart if
+  NODE_ERR(msg);
   * not installed.  Follow this by a H/W timeout is a robust way to insure that
   * other interrupts / callbacks don't fire and reference THE old LFS context.
   */
  asm("break 0,0" ::);
  while (1) {}
  while (1) {}  // Force WDT as the ROM software_reset() doesn't seem to work
  return 0;
 }
 /*
- * In the arg is a valid LFS module name then return the LClosure pointing to it.
+ * If the arg is a valid LFS module name then return the LClosure
- * Otherwise return:
+ * pointing to it. Otherwise return:
 *  -  The Unix time that the LFS was built
 *  -  The base address and length of the LFS
- *  -  An array of the module names in the the LFS
+ *  -  An array of the module names in the LFS
 */
 LUAI_FUNC int luaN_index (lua_State *L) {
  int i;
@ -270,5 +299,262 @@ LUAI_FUNC int luaN_index (lua_State *L) {
  lua_insert(L, 4);
  return 5;
 }
 /* =====================================================================================
 * The following routines use my uzlib which was based on pfalcon's inflate and
 * deflate routines.  The standard NodeMCU make also makes two host tools uz_zip
 * and uz_unzip which also use these and luac.cross uses the deflate. As discussed
 * below, The main action routine loadLFS() calls uzlib_inflate() to do the actual
 * stream inflation but uses three supplied CBs to abstract input and output
 * stream handling.
 *
 * ESP8266 RAM limitations and heap fragmentation are a key implementation
 * constraint and hence these routines use a number of ~2K buffers (11) as
 * working storage.
 *
 * The inflate is done twice, in order to limit storage use and avoid forward /
 * backward reference issues.  However this has a major advantage that the LFS
 * is scanned with the headers, CRC, etc. validated BEFORE the write to flash
 * is started, so the only real chance of failure during the second pass
 * write is if a power fail occurs during the pass.
 */
 static void flash_error(const char *err) {
  if (out)
    out->error = err;
  if (in && in->inflate_state)
    uz_free(in->inflate_state);
  lua_pushnil(out->L);   /* can't use it on a cpcall anyway */
  lua_error(out->L);
 }
 /*
 * uzlib_inflate does a stream inflate on an RFC 1951 encoded data stream.
 * It uses three application-specific CBs passed in the call to do the work:
 *
 * -  get_byte()     CB to return next byte in input stream
 * -  put_byte()     CB to output byte to output buffer
 * -  recall_byte()  CB to retrieve a historic byte from the output
 *                   buffer.
 *
 *  Note that put_byte() also triggers secondary CBs to do further processing.
 */
 /*
 * Input callback: hand back the next byte of the LFS image file.  The file
 * is consumed through in->block in chunks of at most READ_BLOCKSIZE bytes;
 * in->left counts the bytes still unread in the current chunk.
 */
 static uint8_t get_byte (void) {
  uint8_t next;
  in->left--;
  if (in->left < 0) {
    /* Current chunk is used up, so pull in the next one from the file */
    int chunk = in->len - in->bytesRead;
    if (chunk > READ_BLOCKSIZE)
      chunk = READ_BLOCKSIZE;
    if (vfs_read(in->fd, in->block, chunk) != chunk)
      flash_error("read error on LFS image file");
    system_soft_wdt_feed();
    in->inPtr      = in->block;
    in->bytesRead += chunk;
    in->left       = chunk - 1;
  }
  next = *in->inPtr;
  in->inPtr++;
  return next;
 }
 /*
 * Output callback: append one inflated byte to the current output block
 * (out->block[0]).  When that block fills, or the last byte of the image has
 * been written, the per-pass callback is invoked and the block pointers are
 * rotated so that the oldest block becomes the new current block.
 */
 static void put_byte (uint8_t value) {
  int pos = out->ndx % WRITE_BLOCKSIZE;  /* counts from 0 */
  out->block[0]->byte[pos] = value;
  pos++;
  out->ndx++;
  if (pos != WRITE_BLOCKSIZE && out->ndx != out->len)
    return;                              /* block not yet complete */
  if (out->fullBlkCB)
    out->fullBlkCB();
  {
    /* rotate the block pointers by one (redundant on last block, but so what) */
    int i;
    outBlock *recycled = out->block[WRITE_BLOCKS - 1];
    for (i = WRITE_BLOCKS - 1; i > 0; i--)
      out->block[i] = out->block[i - 1];
    out->block[0] = recycled;
  }
 }
 /*
 * Dictionary callback: return the byte that was output `offset` bytes ago.
 * The history is held in the rotating out->block[] array, with block[0]
 * being the block currently being filled.
 */
 static uint8_t recall_byte (uint offset) {
  uint target, blocksBack;
  if(offset > DICTIONARY_WINDOW || offset >= out->ndx)
    flash_error("invalid dictionary offset on inflate");
  /* absolute position of the wanted byte in the output stream */
  target     = out->ndx - offset;
  /* how many blocks behind the current one that position lies */
  blocksBack = out->ndx / WRITE_BLOCKSIZE - target / WRITE_BLOCKSIZE;
  return out->block[blocksBack]->byte[target % WRITE_BLOCKSIZE];
 }
 /*
 * On the first pass the break index is set to call this process at the end
 * of each completed output buffer.
 *  -  On the first call, the Flash Header is checked.
 *  -  On each call the CRC is rolled up for that buffer.
 *  -  Once the flags array is in-buffer this is also captured.
 * This logic is slightly complicated by the fact that the last buffer is
 * typically short.
 */
 /*
 * First-pass block callback, invoked by put_byte() each time an output
 * block is complete.  Validates the flash header found in the first block,
 * folds the block into the running CRC and captures the trailing relocation
 * flag words into out->flags.  Errors are raised through flash_error().
 * Always returns 1.
 */
 int procFirstPass (void) {
  /* number of valid bytes in this block; only the last block can be short */
  int len = (out->ndx % WRITE_BLOCKSIZE) ?
               out->ndx % WRITE_BLOCKSIZE : WRITE_BLOCKSIZE;
  if (out->ndx <= WRITE_BLOCKSIZE) {
    /* Process the flash header and cache the FlashHeader fields we need */
    FlashHeader *fh = cast(FlashHeader *, out->block[0]);
    out->flashLen   = fh->flash_size;                         /* in bytes */
    out->flagsLen   = (out->len-fh->flash_size)/WORDSIZE;     /* in words */
    out->flash_sig  = fh->flash_sig;

    if ((fh->flash_sig & FLASH_FORMAT_MASK) != FLASH_FORMAT_VERSION)
      flash_error("Incorrect LFS header version");
    if ((fh->flash_sig & FLASH_SIG_B2_MASK) != FLASH_SIG_B2)
      flash_error("Incorrect LFS build type");
    if ((fh->flash_sig & ~FLASH_SIG_ABSOLUTE) != FLASH_SIG)
      flash_error("incorrect LFS header signature");
    if (fh->flash_size > FLASH_SIZE)
      flash_error("LFS Image too big for configured LFS region");
    /* size must be word aligned, with one flag bit per image word */
    if ((fh->flash_size & 0x3) ||
         out->flagsLen != 1 + (out->flashLen/WORDSIZE - 1) / BITS_PER_WORD)
      flash_error("LFS length mismatch");
    out->flags = luaM_newvector(out->L, out->flagsLen, uint);
  }

  /* update running CRC */
  out->crc = uzlib_crc32(out->block[0], len, out->crc);

  /* copy out any flag vector */
  if (out->ndx > out->flashLen) {
    /* offset within this block at which the flag words begin */
    int start = out->flashLen - (out->ndx - len);
    if (start < 0) start = 0;
    memcpy(out->flags + out->flagsNdx, out->block[0]->byte + start, len - start);
    out->flagsNdx += (len -start) / WORDSIZE;  /* flashLen and len are word aligned */
  }
  return 1;
 }
 /*
 * Second-pass block callback, invoked by put_byte() each time an output
 * block is complete.  Relocates the tagged address words in a copy of the
 * block and writes that copy to flash.  Once the whole image (flashLen
 * bytes) has been written, the final flash signature is rewritten and the
 * callback disables itself.  Always returns 1.
 */
 int procSecondPass (void) {
 /*
  * The length rules are different for the second pass since this only processes
  * upto the flashLen and not the full image.  This also works in word units.
  * (We've already validated these are word multiples.)
  */
  int i, len = (out->ndx > out->flashLen) ?
                  (out->flashLen % WRITE_BLOCKSIZE) / WORDSIZE :
                  WRITE_BLOCKSIZE / WORDSIZE;
  uint32_t *buf = (uint32_t *) out->buffer.byte, flags;
 /*
  * Relocate all the addresses tagged in out->flags.  This can't be done in
  * place because the out->blocks are still in use as dictionary content so
  * first copy the block to a working buffer and do the relocation in this.
  */
  memcpy(out->buffer.byte, out->block[0]->byte, WRITE_BLOCKSIZE);
  for (i=0; i<len; i++,flags>>=1 ) {
    if ((i&31)==0)                      /* fetch the next 32 flag bits */
      flags = out->flags[out->flagsNdx++];
    if (flags&1)                        /* word offset -> mapped address */
      buf[i] = WORDSIZE*buf[i] + cast(uint32_t, flashAddr);
  }
 /*
  * On the first block, the flash_sig is written with the in-progress bit set
  * and this is not cleared until the end.
  */
  if (out->ndx <= WRITE_BLOCKSIZE)
    buf[0] = out->flash_sig | FLASH_SIG_IN_PROGRESS;

  flashBlock(buf, len*WORDSIZE);

  if (out->ndx >= out->flashLen) {
    /* we're done so disable CB and rewrite flash sig to complete flash */
    flashSetPosition(0);
    flashBlock(&out->flash_sig, WORDSIZE);
    out->fullBlkCB = NULL;
  }
  return 1;   /* the function is declared int; match procFirstPass() */
 }
 /*
 * loadLFS() is called in protected mode from luaN_reload_reboot so that it
 * can recover from out of memory and other thrown errors.  loadLFSgc() GCs
 * any resources.
 */
 /*
 * Body of the protected call made by luaN_reload_reboot().  The image file
 * name is passed as the light userdata at stack index 1.
 *
 * The compressed image is inflated twice: the first pass (procFirstPass)
 * validates the header, lengths and CRC and collects the relocation flags
 * without touching flash; the second pass (procSecondPass) erases the LFS
 * region and writes the relocated image.  Any failure is raised through
 * flash_error(), so a normal return (0) means the flash has been rewritten.
 * The in / out work areas are left allocated for loadLFSgc() to release.
 */
 static int loadLFS (lua_State *L) {
  const char *fn = cast(const char *, lua_touserdata(L, 1));
  int i, res;
  uint32_t crc;

  /* Allocate and zero in and out structures */

  in = NULL; out = NULL;
  in  = luaM_new(L, struct INPUT);
  memset(in, 0, sizeof(*in));
  out = luaM_new(L, struct OUTPUT);
  memset(out, 0, sizeof(*out));
  out->L         = L;
  out->fullBlkCB = procFirstPass;
  out->crc       = ~0;

  /* Open LFS image file, read unpacked length from last 4 bytes and rewind */
  if (!(in->fd = vfs_open(fn, "r")))
    flash_error("LFS image file not found");
  in->len = vfs_size(in->fd);
  if (in->len <= 200 ||        /* size of an empty luac output */
      vfs_lseek(in->fd, in->len-4, VFS_SEEK_SET) != in->len-4 ||
      vfs_read(in->fd, &out->len, sizeof(uint)) != sizeof(uint))
    flash_error("read error on LFS image file");
  vfs_lseek(in->fd, 0, VFS_SEEK_SET);

  /* Allocate the out buffers: out->block[] has exactly WRITE_BLOCKS slots */
  for(i = 0;  i < WRITE_BLOCKS; i++)
    out->block[i] = luaM_new(L, outBlock);

  /* first inflate pass */
  if (uzlib_inflate (get_byte, put_byte, recall_byte,
                     in->len, &crc, &in->inflate_state) < 0)
    flash_error("read error on LFS image file");

  if (crc != ~out->crc)
    flash_error("checksum error on LFS image file");

  out->fullBlkCB = procSecondPass;
  out->flagsNdx  = 0;
  out->ndx       = 0;
  in->bytesRead  = in->left = 0;
 /*
  * Once we have completed the 1st pass then the LFS image has passed the
  * basic signature, crc and length checks, so now we can reset the counts
  * to do the actual write to flash on the second pass.
  */
  vfs_lseek(in->fd, 0, VFS_SEEK_SET);
  flashErase(0,(out->flashLen - 1)/FLASH_PAGE_SIZE);
  flashSetPosition(0);

  /* second inflate pass: keep the result so the failure can be reported */
  res = uzlib_inflate(get_byte, put_byte, recall_byte,
                      in->len, &crc, &in->inflate_state);
  if (res < 0) {
    const char *err[] = {"Data_error during decompression",
                         "Chksum_error during decompression",
                         "Dictionary error during decompression",
                         "Memory_error during decompression"};
    /* NOTE(review): assumes the uzlib error codes run consecutively downward
     * from UZLIB_DATA_ERROR; anything outside the table gets a generic text */
    int e = UZLIB_DATA_ERROR - res;
    if (e >= 0 && e < (int) (sizeof(err)/sizeof(err[0])))
      flash_error(err[e]);
    else
      flash_error("Unknown error during decompression");
  }
  return 0;
 }
 /*
 * Release everything that loadLFS() allocated: the output blocks, the
 * relocation flag vector, the in / out control structures and the open image
 * file.  Called through lua_cpcall() by luaN_reload_reboot(); safe to call
 * when in and/or out were never allocated.  The file-scope in / out pointers
 * are reset to NULL so that no stale pointer survives the free.
 * Always returns 0.
 */
 static int loadLFSgc (lua_State *L) {
  int i;
  if (out) {
    for (i = 0; i < WRITE_BLOCKS; i++)
      if (out->block[i])
        luaM_free(L, out->block[i]);
    if (out->flags)
      luaM_freearray(L, out->flags, out->flagsLen, uint32_t);
    luaM_free(L, out);
    out = NULL;
  }
  if (in) {
    if (in->fd)
      vfs_close(in->fd);
    luaM_free(L, in);
    in = NULL;
  }
  return 0;
 }
 #endif
--- a/app/lua/lflash.h
+++ b/app/lua/lflash.h
@ -15,7 +15,8 @@
 #else
 # define FLASH_SIG_B1 0x00
 #endif
-
+#define FLASH_FORMAT_VERSION (1 << 8)
 #define FLASH_FORMAT_MASK    0xF00
 #ifdef LUA_PACK_TVALUES
 #ifdef LUA_NUMBER_INTEGRAL
 #error "LUA_PACK_TVALUES is only valid for Floating point builds" 
@ -24,9 +25,10 @@
 #else
 # define FLASH_SIG_B2 0x00
 #endif
 # define FLASH_SIG_B2_MASK 0x04
 #define FLASH_SIG_ABSOLUTE    0x01
 #define FLASH_SIG_IN_PROGRESS 0x08
-#define FLASH_SIG  (0xfafaaf50 | FLASH_SIG_B2 | FLASH_SIG_B1)
+#define FLASH_SIG  (0xfafaa050 | FLASH_FORMAT_VERSION |FLASH_SIG_B2 | FLASH_SIG_B1)
 typedef lu_int32 FlashAddr;
 typedef struct {
--- a/app/lua/lgc.c
+++ b/app/lua/lgc.c
@ -43,7 +43,7 @@
 #define stringmark(s)	if (!isLFSobject(&(s)->tsv)) {reset2bits((s)->tsv.marked, WHITE0BIT, WHITE1BIT);}
-#define isfinalized(u)		testbit((u)->marked, FINALIZEDBIT)
+#define isfinalized(u)		testbit(getmarked(u), FINALIZEDBIT)
 #define markfinalized(u)	l_setbit((u)->marked, FINALIZEDBIT)
@ -73,12 +73,12 @@ static void removeentry (Node *n) {
 static void reallymarkobject (global_State *g, GCObject *o) {
  /* don't mark LFS Protos (or strings) */
-  if (o->gch.tt == LUA_TPROTO && isLFSobject(&(o->gch)))
+  if (gettt(&o->gch) == LUA_TPROTO && isLFSobject(&(o->gch)))
    return;
  lua_assert(iswhite(o) && !isdead(g, o));
  white2gray(o);
-  switch (o->gch.tt) {
+  switch (gettt(&o->gch)) {
    case LUA_TSTRING: {
      return;
    }
@ -295,7 +295,7 @@ static l_mem propagatemark (global_State *g) {
  GCObject *o = g->gray;
  lua_assert(isgray(o));
  gray2black(o);
-  switch (o->gch.tt) {
+  switch (gettt(&o->gch)) {
    case LUA_TTABLE: {
      Table *h = gco2h(o);
      g->gray = h->gclist;
@ -400,7 +400,7 @@ static void cleartable (GCObject *l) {
 static void freeobj (lua_State *L, GCObject *o) {
-  switch (o->gch.tt) {
+  switch (gettt(&o->gch)) {
    case LUA_TPROTO:
      lua_assert(!isLFSobject(&(o->gch)));
      luaF_freeproto(L, gco2p(o));
--- a/app/lua/lgc.h
+++ b/app/lua/lgc.h
@ -102,8 +102,8 @@
 #define fixedstack(x)	l_setbit((x)->marked, FIXEDSTACKBIT)
 #define unfixedstack(x)	resetbit((x)->marked, FIXEDSTACKBIT)
 #ifdef LUA_FLASH_STORE
-#define isLFSobject(x)  testbit((x)->marked, LFSBIT)
+#define isLFSobject(x)  testbit(getmarked(x), LFSBIT)
-#define stringfix(s)    if (!test2bits((s)->tsv.marked, FIXEDBIT, LFSBIT)) {l_setbit((s)->tsv.marked, FIXEDBIT);}
+#define stringfix(s)    if (!test2bits(getmarked(&(s)->tsv), FIXEDBIT, LFSBIT)) {l_setbit((s)->tsv.marked, FIXEDBIT);}
 #else
 #define isLFSobject(x) (0)
 #define stringfix(s)   {l_setbit((s)->tsv.marked, FIXEDBIT);}
--- a/app/lua/lobject.h
+++ b/app/lua/lobject.h
@ -34,20 +34,33 @@
 #define LUA_TUPVAL	(LAST_TAG+2)
 #define LUA_TDEADKEY	(LAST_TAG+3)
 #ifdef __XTENSA__
 /*
 ** force aligned access to critical fields in Flash-based structures
 ** wo is the offset of aligned word in bytes 0,4,8,..
 ** bo is the field within the word in bits 0..31 
 */
 #define GET_BYTE_FN(name,t,wo,bo) \
 static inline lu_byte get ## name(void *o) { \
  lu_byte res;  /* extract named field */ \
  asm ("l32i  %0, %1, " #wo "; extui %0, %0, " #bo ", 8;" : "=r"(res) : "r"(o) : );\
  return res; }  
 #else
 #define GET_BYTE_FN(name,t,wo,bo) \
 static inline lu_byte get ## name(void *o) { return ((t *)o)->name; }
 #endif
 /*
 ** Union of all collectable objects
 */
 typedef union GCObject GCObject;
 /*
 ** Common Header for all collectable objects (in macro form, to be
 ** included in other objects)
 */
 #define CommonHeader	GCObject *next; lu_byte tt; lu_byte marked
 /*
 ** Common header in struct form
 */
@ -55,11 +68,18 @@ typedef struct GCheader {
  CommonHeader;
 } GCheader;
 /*
 ** Word aligned inline access functions for the CommonHeader tt and marked fields.
 ** Note that these MUST be consistent with the CommonHeader definition above.  Arg 
 ** 3 is a word offset (4 bytes in this case) and arg 4 the bit offset in the word.
 */
 GET_BYTE_FN(tt,GCheader,4,0)
 GET_BYTE_FN(marked,GCheader,4,8)
 #if defined(LUA_PACK_VALUE) || defined(ELUA_ENDIAN_BIG) || defined(ELUA_ENDIAN_SMALL)
 # error "NodeMCU does not support the eLua LUA_PACK_VALUE and ELUA_ENDIAN defines"
 #endif
 /*
 ** Union of all Lua values
 */
@ -214,7 +234,6 @@ typedef struct lua_TValue {
 #define iscollectable(o)	(ttype(o) >= LUA_TSTRING)
 typedef TValue *StkId;  /* index to stack elements */
--- a/app/lua/lstring.h
+++ b/app/lua/lstring.h
@ -13,7 +13,7 @@
 #include "lstate.h"
-#define sizestring(s) (sizeof(union TString)+(testbit((s)->marked, READONLYBIT) ? sizeof(char **) : ((s)->len+1)*sizeof(char)))
+#define sizestring(s) (sizeof(union TString)+(testbit(getmarked(s), READONLYBIT) ? sizeof(char **) : ((s)->len+1)*sizeof(char)))
 #define sizeudata(u)	(sizeof(union Udata)+(u)->len)
--- a/app/lua/luac_cross/Makefile
+++ b/app/lua/luac_cross/Makefile
@ -1,12 +1,12 @@
 # 
-# This Make file is called from the core Makefile hierarchy which is a hierarchical
+# This Make file is called from the core Makefile hierarchy which is a hierarchical
-# make which uses parent callbacks to implement inheritance.  However if luac_cross
+# make which uses parent callbacks to implement inheritance.  However the luac_cross
-# build stands outside this it uses the host toolchain to implement a separate
+# build stands outside this and uses the host toolchain to implement a separate
 # host build of the luac.cross image. 
 #
 .NOTPARALLEL:
-CCFLAGS:= -I.. -I../../include -I../../../include -I ../../libc
+CCFLAGS:= -I.. -I../../include -I../../libc -I../../uzlib
 LDFLAGS:= -L$(SDK_DIR)/lib -L$(SDK_DIR)/ld -lm -ldl -Wl,-Map=mapfile
 CCFLAGS += -Wall
@ -31,13 +31,13 @@ LUASRC  := lapi.c      lauxlib.c   lbaselib.c  lcode.c     ldblib.c    ldebug.c
           lrotable.c  lstate.c    lstring.c   lstrlib.c   ltable.c    ltablib.c \
           ltm.c       lundump.c   lvm.c       lzio.c
 LIBCSRC := c_stdlib.c
 UZSRC   := uzlib_deflate.c crc32.c
 #
 # This relies on the files being unique on the vpath
 #
-SRC      := $(LUACSRC) $(LUASRC) $(LIBCSRC)
+SRC      := $(LUACSRC) $(LUASRC) $(LIBCSRC) $(UZSRC)
-vpath %.c .:..:../../libc
+vpath %.c .:..:../../libc:../../uzlib
 ODIR   := .output/$(TARGET)/$(FLAVOR)/obj
@ -47,16 +47,11 @@ DEPS   := $(SRC:%.c=$(ODIR)/%.d)
 CFLAGS = $(CCFLAGS) $(DEFINES)  $(EXTRA_CCFLAGS) $(STD_CFLAGS) $(INCLUDES)
 DFLAGS = $(CCFLAGS) $(DDEFINES) $(EXTRA_CCFLAGS) $(STD_CFLAGS) $(INCLUDES)
-CC := $(WRAPCC) gcc
+CC := gcc
 ECHO := echo
 BUILD_TYPE := $(shell $(CC) $(EXTRA_CCFLAGS) -E -dM - <../../../app/include/user_config.h | grep LUA_NUMBER_INTEGRAL | wc -l)
 ifeq ($(BUILD_TYPE),0)
 IMAGE  := ../../../luac.cross
 else
 IMAGE  := ../../../luac.cross.int
 endif
 .PHONY: test clean all
@ -70,7 +65,6 @@ test :
 	@echo SRC: $(SRC)
 	@echo OBJS: $(OBJS)
 	@echo DEPS: $(DEPS)
 	@echo IMAGE: $(IMAGE)
 clean :
 	$(RM) -r $(ODIR)
--- a/app/lua/luac_cross/lflashimg.c
+++ b/app/lua/luac_cross/lflashimg.c
@ -1,4 +1,4 @@
-/*
+/***--
 ** lflashimg.c
 ** Dump a compiled Proto hierarchy to a RO (Flash) image file
 ** See Copyright Notice in lua.h
@ -19,6 +19,7 @@
 #undef LUA_FLASH_STORE
 #define LUA_FLASH_STORE
 #include "lflash.h"
 #include "uzlib.h"
 //#define LOCAL_DEBUG
@ -46,18 +47,18 @@ typedef unsigned int uint;
 * independent image format, which permits the  on-device image loader to load the LFS
 * image at an appropriate base within the flash address space. As all objects in the
 * LFS can be treated as multiples of 4-byte words, also all address fields are both 
- * word aligned, and any address references within the LFS are also word-aligned, 
+ * word aligned, and any address references within the LFS are also word-aligned.
 * such addresses are stored in a special format, where each PI address is two 
 * 16-bit unsigned offsets:
 *
- *   Bits 0-15 is the offset into the LFS that this address refers to
+ * This version adds gzip compression of the generated LFS image for more efficient
- *   Bits 16-31 is the offset linking to the PIC next address.
+ * over-the-air (OTA) transfer, so the method of tagging address words has been 
 * replaced by a scheme which achieves better compression: an additional bitmap
 * has been added to the image, with each bit corresponding to a word in the image 
 * and set if the corresponding word is an address.  The addresses are stored as
 * signed relative word offsets.
 *
- * Hence the LFS can be up to 256Kb in length and the flash loader can use the forward
+ * The unloader is documented in lflash.c.  Note that this relocation process is 
- * links to chain down PI address from the mainProto address at offet 3 to all image 
+ * skipped for absolute addressed images (which are identified by the 
- * addresses during load and convert them to the corresponding correct absolute memory
+ * FLASH_SIG_ABSOLUTE bit setting in the flash signature).
 * addresses.  This reloation process is skipped for absolute addressed images (which
 * are identified by the FLASH_SIG_ABSOLUTE bit setting in the flash signature.
 *
 * The flash image has a standard header detailed in lflash.h
 *
@ -66,7 +67,7 @@ typedef unsigned int uint;
 * and int may not have the same size. Hence addresses with the must be declared as
 * the FlashAddr type rather than typed C pointers and must be accessed through macros.
 *
- * ALso note that image built with a given LUA_PACK_TVALUES / LUA_NUNBER_INTEGRAL
+ * Also note that an image built with a given LUA_PACK_TVALUES / LUA_NUMBER_INTEGRAL
 * combination must be loaded into a corresponding firmware build.  Hence these
 * configuration options are also included in the FLash Signature.
 *
@ -96,8 +97,19 @@ typedef struct flashts {       /* This is the fixed 32-bit equivalent of TString
 #endif
 static uint curOffset = 0;
-static uint flashImage[LUA_MAX_FLASH_SIZE];
+
-static unsigned char flashAddrTag[LUA_MAX_FLASH_SIZE/WORDSIZE];
+/*
 * The flashAddrTag is a bit array, one bit per flashImage word denoting
 * whether the corresponding word is a relative address.  The defines
 * are access methods for this bit array.
 */ 
 static uint flashImage[LUA_MAX_FLASH_SIZE + LUA_MAX_FLASH_SIZE/32];
 static uint *flashAddrTag = flashImage + LUA_MAX_FLASH_SIZE;
 /*
 * Bit-array accessors for flashAddrTag: _TW(v) is the index of the 32-bit
 * word holding bit v and _TB(v) is the mask for that bit within the word.
 * Every expansion is fully parenthesised so the macros can be used safely
 * inside larger expressions, and the mask is unsigned so that bit 31 does
 * not involve a signed shift.
 */
 #define _TW(v) ((v)>>5)
 #define _TB(v) (1u<<((v)&0x1F))
 #define setFlashAddrTag(v) (flashAddrTag[_TW(v)] |= _TB(v))
 #define getFlashAddrTag(v) ((flashAddrTag[_TW(v)]&_TB(v)) != 0)
 #define fatal luac_fatal
 extern void __attribute__((noreturn)) luac_fatal(const char* message);
@ -115,7 +127,7 @@ static void *flashAlloc(lua_State* L, size_t n) {
  void *p = (void *)(flashImage + curOffset);
  curOffset += ALIGN(n)>>WORDSHIFT;
  if (curOffset > LUA_MAX_FLASH_SIZE) {
-    fatal("Out of Flash memmory");
+    fatal("Out of Flash memory");
  }
  return p;
 }
@ -128,8 +140,8 @@ static void *flashAlloc(lua_State* L, size_t n) {
 #define toFlashAddr(l, pd, s) _toFlashAddr(l, &(pd), s)
 static void _toFlashAddr(lua_State* L, FlashAddr *a, void *p) {
  uint doffset = cast(char *, a) - cast(char *,flashImage);
-  lua_assert(!(doffset & (WORDSIZE-1)));
+  lua_assert(!(doffset & (WORDSIZE-1)));  // check word aligned
-  doffset >>= WORDSHIFT;
+  doffset >>= WORDSHIFT;                  // and convert to a word offset
  lua_assert(doffset <= curOffset);
  if (p) {
    uint poffset = cast(char *, p) - cast(char *,flashImage);
@ -137,10 +149,8 @@ static void _toFlashAddr(lua_State* L, FlashAddr *a, void *p) {
    poffset >>= WORDSHIFT;
    lua_assert(poffset <= curOffset);
    flashImage[doffset] = poffset;     // Set the pointer to the offset
-    flashAddrTag[doffset] = 1;         // And tag as an address
+    setFlashAddrTag(doffset);          // And tag as an address
-  } else {                             // Special case for NULL pointer
+  } /* else leave clear */             // Special case for NULL pointer
    flashImage[doffset] = 0;
  }
 }
 /*
@ -231,7 +241,7 @@ static void createROstrt(lua_State *L, FlashHeader *fh) {
    fts->marked = bitmask(LFSBIT);           // LFS string with no Whitebits set
    fts->hash   = hash;                      // add hash
    fts->len    = len;                       // and length
-    memcpy(flashAlloc(L, ALIGN(len+1)), p, ALIGN(len+1)); // copy string
+    memcpy(flashAlloc(L, len+1), p, len+1);  // copy string
                                             // include the trailing null char
    lua_pop(L, 1);                           // Junk the value
    lua_pushvalue(L, -1);                    // Dup the key as rawset dumps its copy
@ -308,6 +318,9 @@ static void *flashCopy(lua_State* L, int n, const char *fmt, void *src) {
        case 'I':
          *d++ = *s++;
          break;
        case 'H':
          *d++ = (*s++) & 0;
          break;
        case 'S':
          newts = resolveTString(L, *cast(TString **, s));
          toFlashAddr(L, *d, newts);
@ -318,11 +331,15 @@ static void *flashCopy(lua_State* L, int n, const char *fmt, void *src) {
          /* This code has to work for both Integer and Float build variants */
          memset(d, 0, TARGET_TV_SIZE);
          TValue *sv = cast(TValue *, s);
          /* The value is 0, 4 or 8 bytes depending on type */
          if (ttisstring(sv)) {
            toFlashAddr(L, *d, resolveTString(L, rawtsvalue(sv)));
-          } else { /* non-collectable types all of size lua_Number */
+          } else if (ttisnumber(sv)) {
            lua_assert(!iscollectable(sv));
            *cast(lua_Number*,d) = *cast(lua_Number*,s);
          } else if (!ttisnil(sv)){
            /* all other types are 4 byte */
            lua_assert(!iscollectable(sv));
            *cast(uint *,d) = *cast(uint *,s);
          }
          *cast(int *,cast(lua_Number*,d)+1) = ttype(sv);
          s += FLASH_WORDS(TValue);
@ -338,9 +355,9 @@ static void *flashCopy(lua_State* L, int n, const char *fmt, void *src) {
 /* The debug optimised version has a different Proto layout */
 #ifdef LUA_OPTIMIZE_DEBUG
-#define PROTO_COPY_MASK  "AIAAAAAASIIIIIIIAI"
+#define PROTO_COPY_MASK  "AHAAAAAASIIIIIIIAI"
 #else
-#define PROTO_COPY_MASK  "AIAAAAAASIIIIIIIIAI"
+#define PROTO_COPY_MASK  "AHAAAAAASIIIIIIIIAI"
 #endif
 /*
@ -378,44 +395,52 @@ static void *functionToFlash(lua_State* L, const Proto* orig) {
  return cast(void *, flashCopy(L, 1, PROTO_COPY_MASK, &f));
 }
 /*
 * Scan through the tagged addresses.  This operates in one of two modes.
 *  -  If address is non-zero then the offset is converted back into an absolute
 *     mapped flash address using the specified address base.
 *
 *  -  If the address is zero then form a linked chain, with the upper 16 bits
 *     holding the link to the last offset. As the scan is backwards, this 'last'
 *     address becomes a forward reference for the on-chip LFS loader.
 */
 /*
  * Walk the image words from the top down and rewrite every word that is
  * flagged in flashAddrTag.  With a non-zero address the stored word offset
  * becomes address + 4*offset; with a zero address the index of the
  * previously visited tagged word is OR-ed into the upper 16 bits instead.
  */
 void  linkAddresses(lu_int32 address){
  int prevTagged = 0;
  int ndx = curOffset;
  while (--ndx >= 0) {
    if (!flashAddrTag[ndx])
      continue;                                  /* plain data word */
    lua_assert(flashImage[ndx]<curOffset);
    if (!address) {
      flashImage[ndx] |= prevTagged<<16;         /* chain to previous tagged word */
      prevTagged = ndx;
    } else {
      flashImage[ndx] = 4*flashImage[ndx] + address;
    }
  }
 }
 uint dumpToFlashImage (lua_State* L, const Proto *main, lua_Writer w, 
-                       void* data, int strip, lu_int32 address) {
+                       void* data, int strip, 
                       lu_int32 address, lu_int32 maxSize) {
 // parameter strip is ignored for now
  lua_newtable(L);
  FlashHeader *fh = cast(FlashHeader *, flashAlloc(L, sizeof(FlashHeader)));
  int i, status;
  lua_newtable(L);
  scanProtoStrings(L, main);
  createROstrt(L,  fh);
  toFlashAddr(L, fh->mainProto, functionToFlash(L, main));
  fh->flash_sig = FLASH_SIG + (address ? FLASH_SIG_ABSOLUTE : 0);
  fh->flash_size = curOffset*WORDSIZE;
-  linkAddresses(address);
+  if (fh->flash_size>maxSize) {
-  lua_unlock(L);
+    fatal ("The image is too large for specfied LFS size");
-  int status = w(L, flashImage, curOffset * sizeof(uint), data);
+  }
  if (address) {  /* in absolute mode convert addresses to mapped address */
    for (i = 0 ; i < curOffset; i++)
      if (getFlashAddrTag(i)) 
        flashImage[i] = 4*flashImage[i] + address;
    lua_unlock(L);
    status = w(L, flashImage, fh->flash_size, data);
  } else { /* compressed PI mode */
   /*
    * In image mode, shift the relocation bitmap down directly above
    * the used flashimage.  This consolidated array is then gzipped.
    */
    uint oLen;
    uint8_t *oBuf;
    int bmLen = sizeof(uint)*((curOffset+31)/32);      /* 32 flags to a word */
    memmove(flashImage+curOffset, flashAddrTag, bmLen);
    status = uzlib_compress (&oBuf, &oLen, 
                             (const uint8_t *)flashImage, bmLen+fh->flash_size);
    if (status != UZLIB_OK) {
      luac_fatal("Out of memory during image compression");
    }
    lua_unlock(L);
 #if 0
    status = w(L, flashImage, bmLen+fh->flash_size, data);  
 #else
    status = w(L, oBuf, oLen, data);    
    free(oBuf); 
 #endif
  }
  lua_lock(L);
  return status;
 }
--- a/app/lua/luac_cross/luac.c
+++ b/app/lua/luac_cross/luac.c
@ -35,6 +35,7 @@ static int dumping=1;			/* dump bytecodes? */
 static int stripping=0;	  /* strip debug information? */
 static int flash=0;	  		/* output flash image */
 static lu_int32 address=0;  /* output flash image at absolute location */
 static lu_int32 maxSize=0x40000;  /* maximuum uncompressed image size */
 static int lookup=0;			/* output lookup-style master combination header */
 static char Output[]={ OUTPUT };	/* default output file name */
 static const char* output=Output;	/* actual output file name */
@ -72,6 +73,7 @@ static void usage(const char* message)
 "  -f       output a flash image file\n"
 "  -a addr  generate an absolute, rather than position independent flash image file\n"
 "  -i       generate lookup combination master (default with option -f)\n" 
 "  -m size  maximum LFS image in bytes\n"
 "  -p       parse only\n"
 "  -s       strip debug information\n"
 "  -v       show version information\n"
@ -123,6 +125,13 @@ static int doargs(int argc, char* argv[])
   lookup = 1;
  else if (IS("-l"))			/* list */
   ++listing;
  else if (IS("-m"))			/* specify a maximum image size */
  {
   flash=lookup=1;
   maxSize=strtol(argv[++i],NULL,0);
   if (maxSize & 0xFFF)
     usage(LUA_QL("-e") " maximum size must be a multiple of 4,096");
  }
  else if (IS("-o"))			/* output file */
  {
   output=argv[++i];
@ -264,7 +273,8 @@ struct Smain {
 };
 extern uint dumpToFlashImage (lua_State* L,const Proto *main, lua_Writer w, 
-                              void* data, int strip, lu_int32 address);
+                              void* data, int strip, 
                              lu_int32 address, lu_int32 maxSize);
 static int pmain(lua_State* L)
 {
@ -302,7 +312,7 @@ static int pmain(lua_State* L)
  lua_lock(L);
  if (flash) 
  {
-    result=dumpToFlashImage(L,f,writer, D, stripping, address);
+    result=dumpToFlashImage(L,f,writer, D, stripping, address, maxSize);
  } else
  {
    result=luaU_dump_crosscompile(L,f,writer,D,stripping,target);
--- a/app/user/user_exceptions.c
+++ b/app/user/user_exceptions.c
@ -33,77 +33,70 @@
 #include "user_exceptions.h"
 #define LOAD_MASK   0x00f00fu
 #define L8UI_MATCH  0x000002u
 #define L16UI_MATCH 0x001002u
 #define L16SI_MATCH 0x009002u
 static exception_handler_fn load_store_handler;
 void load_non_32_wide_handler (struct exception_frame *ef, uint32_t cause)
 {
-  /* If this is not EXCCAUSE_LOAD_STORE_ERROR you're doing it wrong! */
+  uint32_t val, insn;
-  (void)cause;
+  (void)cause;  /* If this is not EXCCAUSE_LOAD_STORE_ERROR you're doing it wrong! */
  uint32_t epc1 = ef->epc;
  uint32_t excvaddr;
  uint32_t insn;
  asm (
-    "rsr   %0, EXCVADDR;"    /* read out the faulting address */
+    /*
-    "movi  a4, ~3;"          /* prepare a mask for the EPC */
+     * Move the aligned content of the exception addr to val
-    "and   a4, a4, %2;"      /* apply mask for 32bit aligned base */
+     */
-    "l32i  a5, a4, 0;"       /* load part 1 */
+    "rsr     a6, EXCVADDR;"    /* read out the faulting address */
-    "l32i  a6, a4, 4;"       /* load part 2 */
+    "movi    a5, ~3;"          /* prepare a mask for the EPC */
-    "ssa8l %2;"              /* set up shift register for src op */
+    "and     a5, a5, a6;"      /* apply mask for 32bit aligned base */
-    "src   %1, a6, a5;"      /* right shift to get faulting instruction */
+    "l32i    a5, a5, 0;"       /* load aligned value */
-    :"=r"(excvaddr), "=r"(insn)
+    "ssa8l   a6;"              /* set up shift register for value */
-    :"r"(epc1)
+    "srl     %[val], a5;"      /* shift left to align value */
-    :"a4", "a5", "a6"
+                               /* we are done with a6 = EXCVADDR */
   /*
    *  Move the aligned instruction to insn
    */  
    "movi    a5, ~3;"          /* prepare a mask for the insn */    
    "and     a6, a5, %[epc];"  /* apply mask for 32bit aligned base */
    "l32i    a5, a6, 0;"       /* load part 1 */
    "l32i    a6, a6, 4;"       /* load part 2 */
    "ssa8l   %[epc];"          /* set up shift register for src op */
    "src     %[op], a6, a5;"   /* right shift to get faulting instruction */
    :[val]"=r"(val), [op]"=r"(insn)
    :[epc]"r"(ef->epc)
    :"a5", "a6"
  ); 
-  uint32_t valmask = 0;
+/* These instructions have the format 0xADSBII where AB = opcode and D = dest reg */
-  uint32_t what = insn & LOAD_MASK;
+  uint32_t regno = (insn>>4)&0x0f;                           /* pick out nibble D*/
  uint32_t opcode = (uint8_t) (((insn>>12)<<4)|(insn&0xf));  /* and nibbles AB */
 #define L8UI  0x02u
 #define L16UI 0x12u
 #define L16SI 0x92u
-  if (what == L8UI_MATCH)
+  if (opcode == L8UI) {                       /* L8UI */
-    valmask = 0xffu;
+    val = (uint8_t) val;
-  else if (what == L16UI_MATCH || what == L16SI_MATCH)
+  } else {
-    valmask = 0xffffu;
+    val = (uint16_t) val;                     /* assume L16SI or L16UI */ 
-  else
+    if (opcode == L16SI) {
-  {
+      val = (unsigned)((int)((sint16_t)val)); /* force signed 16->32 bit */
-die:
+    } else if (opcode != L16UI) {
-    /* Turns out we couldn't fix this, so try and chain to the handler
+   /*
-     * that was set. (This is typically a remote GDB break). If none 
+    * Anything other than L8UI, L16SI or L16UI then chain to the next handler
-     * then trigger a system break instead and hang if the break doesn't 
+    * if set (typically a remote GDB break). Otherwise execute the default action
-     * get handled. This is effectively what would happen if the default 
+    * which is to trigger a system break and hang if the break doesn't get handled
-     * handler was installed. */
+    */
-    if (load_store_handler) {
+      if (load_store_handler) {
-      load_store_handler(ef, cause);
+        load_store_handler(NULL, 0 /* ef ,  cause */);
-      return;
+        return;
      } else {
        asm ("break 1, 1");
        while (1) {}
      }
    }
     asm ("break 1, 1");
    while (1) {}
  }   
-
+  ef->a_reg[regno ? regno-1: regno] = val; /* carry out the load */
-  /* Load, shift and mask down to correct size */
+  ef->epc += 3;                            /* resume at following instruction */
  uint32_t val = (*(uint32_t *)(excvaddr & ~0x3));
  val >>= (excvaddr & 0x3) * 8;
  val &= valmask;
  /* Sign-extend for L16SI, if applicable */
  if (what == L16SI_MATCH && (val & 0x8000))
    val |= 0xffff0000;
  int regno = (insn & 0x0000f0u) >> 4;
  if (regno == 1)
    goto die;              /* we can't support loading into a1, just die */
  else if (regno != 0)
    --regno;               /* account for skipped a1 in exception_frame */
  ef->a_reg[regno] = val;  /* carry out the load */
  ef->epc += 3;            /* resume at following instruction */
 }
 /**
 * The SDK's user_main function installs a debugging handler regardless
 * of whether there's a proper handler installed for EXCCAUSE_LOAD_STORE_ERROR,
--- a/app/uzlib/Makefile
+++ b/app/uzlib/Makefile
@ -0,0 +1,45 @@
 #############################################################
 # Required variables for each makefile
 # Discard this section from all parent makefiles
 # Expected variables (with automatic defaults):
 #   CSRCS (all "C" files in the dir)
 #   SUBDIRS (all subdirs with a Makefile)
 #   GEN_LIBS - list of libs to be generated ()
 #   GEN_IMAGES - list of images to be generated ()
 #   COMPONENTS_xxx - a list of libs/objs in the form
 #     subdir/lib to be extracted and rolled up into
 #     a generated lib/image xxx.a ()
 #
 ifndef PDIR
 GEN_LIBS = libuzlib.a
 SUBDIRS = host
 endif
 #############################################################
 # Configuration i.e. compile options etc.
 # Target specific stuff (defines etc.) goes in here!
 # Generally values applying to a tree are captured in the
 #   makefile at its root level - these are then overridden
 #   for a subtree within the makefile rooted therein
 #
 #DEFINES += 
 #############################################################
 # Recursion Magic - Don't touch this!!
 #
 # Each subtree potentially has an include directory
 #   corresponding to the common APIs applicable to modules
 #   rooted at that subtree. Accordingly, the INCLUDE PATH
 #   of a module can only contain the include directories up
 #   its parent path, and not its siblings
 #
 # Required for each makefile to inherit from the parent
 #
 INCLUDES := $(INCLUDES) -I $(PDIR)include
 INCLUDES += -I ./
 INCLUDES += -I ../libc
 PDIR := ../$(PDIR)
 sinclude $(PDIR)Makefile
--- a/app/uzlib/README.md
+++ b/app/uzlib/README.md
@ -0,0 +1,35 @@
 uzlib - Deflate/Zlib-compatible LZ77 compression library
 ======================================================
 This is a heavily modified and cut down version of Paul Sokolovsky's
 uzlib library. This library has exported routines which
 -  Can compress data to a Deflate-compatible bitstream, albeit with a lower
 compression ratio than the Zlib Deflate algorithm (as a static Deflate Huffman
 tree encoding is used for the bitstream). Note that since this compression is
 done in RAM and requires ~4 bytes per byte of the input record, it should only
 be called for compressing small records on the ESP8266.
 -  Can decompress any valid Deflate, Zlib, and Gzip (further called just
 "Deflate") bitstream less than 16Kb, and any arbitrary length stream
 compressed by the uzlib compressor.
 uzlib aims for minimal code size and runtime memory requirements, and thus
 is suitable for embedded systems and IoT devices such as the ESP8266.
 uzlib is based on:
 -  tinf library by Joergen Ibsen (Deflate decompression)
 -  Deflate Static Huffman tree routines by Simon Tatham
 -  LZ77 compressor by Paul Sokolovsky provided my initial inspiration, but
 I ended up rewriting this following RFC 1951 to get improved compression
 performance.
 The above 16Kb limitation arises from the RFC 1951 use of a 32Kb dictionary,
 which is impractical on a chipset with only ~40 Kb RAM available to
 applications.
 The relevant copyright statements are provided in the source files which
 use this code.
 uzlib library is licensed under Zlib license.
--- a/app/uzlib/crc32.c
+++ b/app/uzlib/crc32.c
@ -0,0 +1,62 @@
 /*
 * CRC32 checksum
 *
 * Copyright (c) 1998-2003 by Joergen Ibsen / Jibz
 * All Rights Reserved
 *
 * http://www.ibsensoftware.com/
 *
 * This software is provided 'as-is', without any express
 * or implied warranty.  In no event will the authors be
 * held liable for any damages arising from the use of
 * this software.
 *
 * Permission is granted to anyone to use this software
 * for any purpose, including commercial applications,
 * and to alter it and redistribute it freely, subject to
 * the following restrictions:
 *
 * 1. The origin of this software must not be
 *    misrepresented; you must not claim that you
 *    wrote the original software. If you use this
 *    software in a product, an acknowledgment in
 *    the product documentation would be appreciated
 *    but is not required.
 *
 * 2. Altered source versions must be plainly marked
 *    as such, and must not be misrepresented as
 *    being the original software.
 *
 * 3. This notice may not be removed or altered from
 *    any source distribution.
 */
 /*
 * CRC32 algorithm taken from the zlib source, which is
 * Copyright (C) 1995-1998 Jean-loup Gailly and Mark Adler
 */
 #include <stdint.h>
 /* Nibble-indexed CRC32 table: one entry per 4-bit value */
 static const unsigned int tinf_crc32tab[16] = {
    0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac,
    0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
    0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
    0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c
 };
 /*
  * Fold `length` bytes starting at `data` into the running checksum `crc`.
  *
  * Pass 0xffffffff as `crc` for the first chunk and the previous return
  * value for each later chunk.  The result is NOT inverted here, so the
  * caller must complement the value returned for the last chunk to obtain
  * the final CRC32.
  */
 uint32_t uzlib_crc32(const void *data, unsigned int length, uint32_t crc)
 {
    const unsigned char *cursor = (const unsigned char *)data;
    for (; length != 0; --length)
    {
       crc ^= *cursor++;
       /* each byte is consumed as two 4-bit table steps */
       crc = (crc >> 4) ^ tinf_crc32tab[crc & 0x0f];
       crc = (crc >> 4) ^ tinf_crc32tab[crc & 0x0f];
    }
    return crc;
 }
--- a/app/uzlib/host/Makefile
+++ b/app/uzlib/host/Makefile
@ -0,0 +1,67 @@
 # 
 # This Makefile is called from the core Makefile hierarchy, which is a hierarchical
 # make which uses parent callbacks to implement inheritance.  However this host
 # build stands outside this and uses the host toolchain to implement a separate
 # host build of the uz_zip and uz_unzip images.
 #
 .NOTPARALLEL:
 CCFLAGS:= -I.. 
 LDFLAGS:= -L$(SDK_DIR)/lib -L$(SDK_DIR)/ld -lm -ldl -Wl,-Map=mapfile
 CCFLAGS += -Wall
 #DEFINES += 
 TARGET = host
 ifeq ($(FLAVOR),debug)
    CCFLAGS        += -O0 -g
    TARGET_LDFLAGS += -O0 -g
    DEFINES        += -DDEBUG_COUNTS
 else
    FLAVOR         =  release
    CCFLAGS        += -O2
    TARGET_LDFLAGS += -O2
 endif
 #
 # This relies on the files being unique on the vpath
 #
 SRC := uz_unzip.c  uz_zip.c  crc32.c uzlib_inflate.c uzlib_deflate.c
 vpath %.c .:..
 ODIR   := .output/$(TARGET)/$(FLAVOR)/obj
 CFLAGS = $(CCFLAGS) $(DEFINES)  $(EXTRA_CCFLAGS) $(STD_CFLAGS) $(INCLUDES)
 DFLAGS = $(CCFLAGS) $(DDEFINES) $(EXTRA_CCFLAGS) $(STD_CFLAGS) $(INCLUDES)
 ROOT = ../../..
 CC := gcc
 ECHO := echo
 IMAGES :=  $(ROOT)/uz_zip $(ROOT)/uz_unzip
 .PHONY: test clean all
 all: $(IMAGES)
 $(ROOT)/uz_zip : $(ODIR)/uz_zip.o $(ODIR)/crc32.o $(ODIR)/uzlib_deflate.o
 	$(CC) $^ -o $@ $(LDFLAGS)
 $(ROOT)/uz_unzip : $(ODIR)/uz_unzip.o $(ODIR)/crc32.o $(ODIR)/uzlib_inflate.o
 	$(CC) $^ -o $@ $(LDFLAGS)
 test :
 	@echo CC: $(CC)
 	@echo SRC: $(SRC)
 	@echo DEPS: $(DEPS)
 clean :
 	$(RM) -r $(ODIR)
 	$(RM) $(IMAGES)
 $(ODIR)/%.o: %.c
 	@mkdir -p $(ODIR);
 	$(CC) $(CFLAGS) -o $@ -c $<
--- a/app/uzlib/host/uz_unzip.c
+++ b/app/uzlib/host/uz_unzip.c
@ -0,0 +1,178 @@
 /************************************************************************
 * NodeMCU unzip wrapper code for uzlib_inflate
 *
 * Note that whilst it would be more straightforward to implement a 
 * simple in memory approach, this utility adopts the same streaming
 * callback architecture as app/lua/lflash.c to enable this code to be
 * tested in a pure host development environment   
 */
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
 #include <stdlib.h>
 #include "uzlib.h"
 /* Test wrapper */
 #define DICTIONARY_WINDOW 16384
 #define READ_BLOCKSIZE 2048
 #define WRITE_BLOCKSIZE 2048
 #define WRITE_BLOCKS ((DICTIONARY_WINDOW/WRITE_BLOCKSIZE)+1)
 #define FREE(v) if (v) uz_free(v)
 typedef uint8_t  uchar;
 typedef uint16_t ushort;
 typedef uint32_t uint;
 /* State of the compressed input stream; a single instance is reached via `in` */
 struct INPUT {
  FILE    *fin;                    /* compressed input file, opened by main() */
  int      len;                    /* input length in bytes, taken from ftell() in main() */
  uint8_t  block[READ_BLOCKSIZE];  /* buffer refilled by get_byte() with fread() */
  uint8_t *inPtr;                  /* next byte in block[] that get_byte() returns */
  int      bytesRead;              /* total bytes fetched from fin so far */
  int      left;                   /* bytes of block[] not yet returned; goes negative to force a refill */
 } *in;
 /* One WRITE_BLOCKSIZE-byte slice of the expanded output */
 typedef struct {
  uint8_t byte[WRITE_BLOCKSIZE];
 } outBlock;
 /* State of the expanded output stream; a single instance is reached via `out` */
 struct OUTPUT {
  FILE     *fout;                  /* expanded output file, opened by main() */
  outBlock *block[WRITE_BLOCKS];   /* block[0] is the block being filled; put_byte() rotates the pointers */
  int       ndx;                   /* count of bytes emitted through put_byte() so far */
  int       written;               /* count of bytes written to fout by processOutRec() */
  int       len;                   /* expected expanded length, read from the last 4 bytes of the input */
  uint32_t  crc;                   /* running uzlib_crc32() of the written output; main() seeds it with ~0 */
  int       breakNdx;              /* set by main() to min(len, WRITE_BLOCKSIZE); not read by the code in this file */
  int     (*fullBlkCB) (void);     /* called by put_byte() on a full block or the final byte; NULL once all output is written */
 } *out;
 /*
 * uzlib_inflate does a stream inflate on an RFC 1951 encoded data stream. 
 * It uses three application-specific CBs passed in the call to do the work:
 *
 * -  get_byte()     CB to return next byte in input stream
 * -  put_byte()     CB to output byte to output buffer
 * -  recall_byte()  CB to retrieve a historic byte from
 *                   the output buffer.
 *
 *  Note that put_byte() also triggers secondary CBs to do further processing.
 */
 /*
  * Input callback: hand back the next byte of the compressed stream,
  * topping up in->block from the file whenever the buffer has run dry.
  * A short read raises UZLIB_DATA_ERROR.
  */
 uint8_t get_byte (void) {
  in->left -= 1;
  if (in->left < 0) {
    /* buffer exhausted: fetch up to one READ_BLOCKSIZE chunk of what remains */
    int chunk = in->len - in->bytesRead;
    if (chunk > READ_BLOCKSIZE)
      chunk = READ_BLOCKSIZE;
    if (fread(in->block, 1, chunk, in->fin) != chunk)
      UZLIB_THROW(UZLIB_DATA_ERROR);
    in->inPtr      = in->block;
    in->bytesRead += chunk;
    in->left       = chunk - 1;    /* one byte is consumed by this call */
  }
  uint8_t next = *in->inPtr;
  in->inPtr += 1;
  return next;
 }
 /*
  * Output callback: append one byte to the current block (out->block[0]).
  * When that block fills, or the final expected byte arrives, the
  * full-block callback (if any) is run and the block pointers are rotated
  * so that the oldest block becomes the new block[0].
  */
 void put_byte (uint8_t value) {
  int slot = out->ndx % WRITE_BLOCKSIZE;     /* position within block[0] */
  out->block[0]->byte[slot] = value;
  out->ndx += 1;
  int blockFull = (slot + 1 == WRITE_BLOCKSIZE);
  int lastByte  = (out->ndx == out->len);
  if (!blockFull && !lastByte)
    return;
  if (out->fullBlkCB)
    out->fullBlkCB();
  /* rotate: every block moves up one slot, the oldest is recycled to slot 0 */
  outBlock *recycled = out->block[WRITE_BLOCKS - 1];
  int i;
  for (i = WRITE_BLOCKS - 1; i > 0; i--)
    out->block[i] = out->block[i - 1];
  out->block[0] = recycled;
 }
 /*
  * History callback: return the byte written `offset` positions back in
  * the output.  Raises UZLIB_DICT_ERROR when offset exceeds
  * DICTIONARY_WINDOW or is not smaller than the number of bytes output.
  */
 uint8_t recall_byte (uint offset) {
  int tooFar  = offset > DICTIONARY_WINDOW;
  int tooDeep = offset >= out->ndx;
  if (tooFar || tooDeep)
    UZLIB_THROW(UZLIB_DICT_ERROR);
  uint target   = out->ndx - offset;              /* absolute output position wanted */
  uint curBlock = out->ndx / WRITE_BLOCKSIZE;     /* block number of the write position */
  uint srcBlock = target / WRITE_BLOCKSIZE;       /* block number holding the target */
  /* block[] is ordered newest first, so index by how many blocks back */
  outBlock *blk = out->block[curBlock - srcBlock];
  return blk->byte[target % WRITE_BLOCKSIZE];
 }
 int processOutRec (void) {
  int len = (out->ndx % WRITE_BLOCKSIZE) ? out->ndx % WRITE_BLOCKSIZE :
                                           WRITE_BLOCKSIZE;
  if (fwrite(out->block[0], 1, len, out->fout) != len)
    UZLIB_THROW(UZLIB_DATA_ERROR);
  out->crc = uzlib_crc32(out->block[0], len, out->crc);
  out->written += len;
  if (out->written == out->len) {
    fclose(out->fout);
    out->fullBlkCB = NULL;
  }
  return 1; 
 }
 /* Report a fatal set-up error on stderr and supply main()'s failure status */
 static int unzipFail (const char *what) {
  fprintf(stderr, "uz_unzip: %s\n", what);
  return 1;
 }
 /*
  * Usage: uz_unzip <infile> <outfile>
  *
  * Inflates <infile> into <outfile> through uzlib_inflate(), using the
  * get_byte / put_byte / recall_byte callbacks above.  The last 4 bytes of
  * the input are taken as the expanded length (read in host byte order).
  *
  * Returns 0 when inflation completes and the CRC matches, 1 otherwise.
  *
  * Set-up work (allocation, fopen, fseek) is checked explicitly rather than
  * inside assert(), so the program still works when built with NDEBUG.
  * On a set-up failure the process simply exits; no clean-up is attempted.
  */
 int main(int argc, char *argv[]) {
  int i, n, res;
  long inLen;
  uint crc;
  void *cxt_not_used;
  if (argc != 3)
    return unzipFail("usage: uz_unzip <infile> <outfile>");
  const char *inFile = argv[1], *outFile = argv[2];
  assert(sizeof(unsigned int) == 4);
  /* allocate and zero the in and out Blocks */
  in  = uz_malloc(sizeof(*in));
  out = uz_malloc(sizeof(*out));
  if (!in || !out)
    return unzipFail("out of memory");
  memset(in, 0, sizeof(*in));
  memset(out, 0, sizeof(*out));
  /* open input file and probe end to read the expanded length */
  if (!(in->fin = fopen(inFile, "rb")))
    return unzipFail("cannot open input file");
  if (fseek(in->fin, -4, SEEK_END) != 0 ||
      fread((uchar*)&(out->len), 1, 4, in->fin) != 4 ||
      (inLen = ftell(in->fin)) < 0 ||
      fseek(in->fin, 0, SEEK_SET) != 0 ||
      out->len < 0)
    return unzipFail("input file is too short, unreadable or has a bad length trailer");
  in->len = inLen;                 /* position after the trailer == file size */
  if (!(out->fout = fopen(outFile, "wb")))
    return unzipFail("cannot open output file");
  printf ("Inflating in=%s out=%s\n", inFile, outFile);
  /* Allocate the out buffers (number depends on the unpacked length) */
  n = (out->len > DICTIONARY_WINDOW) ? WRITE_BLOCKS : 
                                      1 + (out->len-1) / WRITE_BLOCKSIZE;
  /* fill slots 0 and the top n-1, the order in which put_byte() recycles them */
  for(i = WRITE_BLOCKS - n + 1;  i <= WRITE_BLOCKS; i++) {
    if (!(out->block[i % WRITE_BLOCKS] = uz_malloc(sizeof(outBlock))))
      return unzipFail("out of memory");
  }
  out->breakNdx  = (out->len < WRITE_BLOCKSIZE) ? out->len : WRITE_BLOCKSIZE;
  out->fullBlkCB = processOutRec;
  out->crc       = ~0;
  /* Call inflate to do the business */
  res = uzlib_inflate (get_byte, put_byte, recall_byte, in->len, &crc, &cxt_not_used);
  if (res > 0 && crc != ~out->crc)
    res = UZLIB_CHKSUM_ERROR;
  /* processOutRec() closes fout and clears the CB only once all output is
     written; if the CB is still set the file is still open, so close it here */
  if (out->fullBlkCB)
    fclose(out->fout);
  for (i = 0; i < WRITE_BLOCKS; i++)
    FREE(out->block[i]);
  fclose(in->fin);
  FREE(in);
  FREE(out);
  if (res < 0)
    printf("Error during decompression: %d\n", res);
  /* uzlib error codes are negative; a positive result (UZLIB_DONE) is success */
  return (res < 0) ? 1: 0;
 }
--- a/app/uzlib/host/uz_zip.c
+++ b/app/uzlib/host/uz_zip.c
@ -0,0 +1,44 @@
 /************************************************************************
 * NodeMCU zip wrapper code for uzlib_compress
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
 #include <unistd.h>
 #include "uzlib.h"
 #define fwriterec(r) fwrite(&(r), sizeof(r), 1, fout);
 #define BAD_FILE (-1)
 /*
  * Usage: uz_zip <infile> <outfile>
  *
  * Reads the whole of <infile> into memory, compresses it with
  * uzlib_compress() and writes the result to <outfile>.
  *
  * Returns 0 on success and 1 on any failure.  If a failure occurs after the
  * output file has been created, that (empty or partial) file is removed.
  */
 int main (int argc, char **argv) {
  /* validate argc before touching argv[1] / argv[2] */
  if (argc!=3)
    return 1;
  const char *in = argv[1], *out = argv[2];
  printf ("Compressing in=%s out=%s\n", in, out);
  FILE *fin, *fout;
  int status = -1;
  long fileLen;                        /* signed, so an ftell() error is visible */
  uint32_t iLen, oLen = 0;
  uint8_t *iBuf = NULL, *oBuf = NULL;  /* NULL so the clean-up below is always safe */
  if (!(fin = fopen(in, "rb")))
    return 1;
  if (fseek(fin, 0, SEEK_END) || (fileLen = ftell(fin)) <= 0  ||
      (unsigned long) fileLen > UINT32_MAX || fseek(fin, 0, SEEK_SET)) {
    fclose(fin);
    return 1;
  }
  iLen = (uint32_t) fileLen;
  if ((fout = fopen(out, "wb")) == NULL) {
    fclose(fin);
    return 1;
  }
  /* fwrite() returns the number of items written, so write oLen items of
     1 byte each to make the result comparable with oLen */
  if ((iBuf = (uint8_t *) uz_malloc(iLen)) != NULL &&
      fread(iBuf, 1, iLen, fin) == iLen &&
      uzlib_compress (&oBuf, &oLen, iBuf, iLen) == UZLIB_OK &&
      fwrite(oBuf, 1, oLen, fout) == oLen)
    status = UZLIB_OK;
  if (iBuf) uz_free(iBuf);
  if (oBuf) uz_free(oBuf);
  fclose(fin);
  if (fclose(fout) != 0)               /* a failed flush means the output is incomplete */
    status = -1;
  if (status != UZLIB_OK)
    unlink(out);                       /* do not leave a truncated image behind */
  return (status == UZLIB_OK) ? 0: 1;
 }
--- a/app/uzlib/uzlib.h
+++ b/app/uzlib/uzlib.h
@ -0,0 +1,74 @@
 /*
 * uzlib  -  tiny deflate/inflate library (deflate, gzip, zlib)
 *
 * Copyright (c) 2003 by Joergen Ibsen / Jibz
 * All Rights Reserved
 * http://www.ibsensoftware.com/
 *
 * Copyright (c) 2014-2016 by Paul Sokolovsky
 */
 #ifndef UZLIB_INFLATE_H
 #define UZLIB_INFLATE_H
 #include <setjmp.h>
 #if defined(__XTENSA__)
 #include "c_stdint.h"
 #include "mem.h"
 #define UZLIB_THROW(v) longjmp(unwindAddr, (v))
 #define UZLIB_SETJMP setjmp
 #define uz_malloc os_malloc
 #define uz_free os_free
 #else  /* POSIX */
 #include <stdint.h>
 #include <stdlib.h>
 extern int dbg_break(void);
 #define UZLIB_THROW(v) {dbg_break();_longjmp(unwindAddr, (v));}
 #define UZLIB_SETJMP _setjmp
 #define uz_malloc malloc
 #define uz_free free
 #endif
 extern jmp_buf unwindAddr;
 /* ok status, more data produced */
 #define UZLIB_OK             0
 /* end of compressed stream reached */
 #define UZLIB_DONE           1
 #define UZLIB_DATA_ERROR    (-3)
 #define UZLIB_CHKSUM_ERROR  (-4)
 #define UZLIB_DICT_ERROR    (-5)
 #define UZLIB_MEMORY_ERROR  (-6)
 /* checksum types */
 #define UZLIB_CHKSUM_NONE  0
 #define UZLIB_CHKSUM_ADLER 1
 #define UZLIB_CHKSUM_CRC   2
 /* Gzip header codes */
 #define UZLIB_FTEXT    1
 #define UZLIB_FHCRC    2
 #define UZLIB_FEXTRA   4
 #define UZLIB_FNAME    8
 #define UZLIB_FCOMMENT 16
 /* Compression API */
 typedef struct uzlib_data UZLIB_DATA;
 /*
  * Stream-inflate driven by three caller-supplied callbacks, in order:
  * fetch the next input byte, emit one output byte, and return a previously
  * emitted byte a given distance back.  `len` is the length of the
  * compressed input and `*crc` receives a checksum which callers compare
  * against the complement of their own running uzlib_crc32() value.
  * Callers treat a result > 0 as success and a negative result as one of
  * the UZLIB_*_ERROR codes.
  * NOTE(review): `state` appears to receive the decoder context; the host
  * test wrapper passes it but never reads it back -- confirm its contract.
  */
 int uzlib_inflate (uint8_t (*)(void), void (*)(uint8_t),
                   uint8_t (*)(uint32_t), uint32_t len, uint32_t *crc, void **state);
 /*
  * Compress `srcLen` bytes at `src`.  When UZLIB_OK is returned, `*dest` and
  * `*destLen` describe the output buffer, which callers release themselves
  * (uz_free / free).
  */
 int uzlib_compress (uint8_t **dest, uint32_t *destLen,
                    const uint8_t *src, uint32_t srcLen);
 /* Checksum API */
 /* crc is previous value for incremental computation, 0xffffffff initially */
 uint32_t uzlib_crc32(const void *data, uint32_t length, uint32_t crc);
 #endif /* UZLIB_INFLATE_H */
--- a/app/uzlib/uzlib_deflate.c
+++ b/app/uzlib/uzlib_deflate.c
@ -0,0 +1,585 @@
 /*
 * This implementation draws heavily on the work done by Paul Sokolovsky
 * (https://github.com/pfalcon) and his uzlib library which in turn uses
 * work done by Joergen Ibsen, Simon Tatham and others.  All of this work
 * is under an unrestricted right to use subject to copyright attribution.
 * Two copyright wordings (variants A and B) are following.
 *
 * (c) statement A initTables, copy, literal
 *
 * The remainder of this code has been written by me, Terry Ellison 2018,
 * under the standard NodeMCU MIT licence, but is available to the other
 * contributors to this source under any permissive licence.
 *
 * My primary algorithmic reference is RFC 1951: "DEFLATE Compressed Data
 * Format Specification version 1.3", dated May 1996.
 *
 * Also because the code in this module is drawn from different sources,
 * the different coding practices can be confusing, I have standardised
 * the source by:
 *
 * -  Adopting the 2 indent rule as in the rest of the firmware
 *
 * -  I have replaced the various mix of char, unsigned char and uchar
 *    by the single uchar type; ditto for ushort and uint.
 *
 * -  All internal (non-exported) functions and data are static
 *
 * -  Only exported functions and data have the module prefix.  All
 *    internal (static) variables and fields are lowerCamelCase.
 *
 ***********************************************************************
 * Copyright statement A for Zlib (RFC1950 / RFC1951) compression for PuTTY.
 PuTTY is copyright 1997-2014 Simon Tatham.
 Portions copyright Robert de Bath, Joris van Rantwijk, Delian
 Delchev, Andreas Schultz, Jeroen Massar, Wez Furlong, Nicolas Barry,
 Justin Bradford, Ben Harris, Malcolm Smith, Ahmad Khalifa, Markus
 Kuhn, Colin Watson, and CORE SDI S.A.
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation files
 (the "Software"), to deal in the Software without restriction,
 including without limitation the rights to use, copy, modify, merge,
 publish, distribute, sublicense, and/or sell copies of the Software,
 and to permit persons to whom the Software is furnished to do so,
 subject to the following conditions:
 The above copyright notice and this permission notice shall be
 included in all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 NONINFRINGEMENT.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE
 FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
 CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ************************************************************************
 Copyright statement B for genlz77 functions:
 *
 * genlz77  -  Generic LZ77 compressor
 *
 * Copyright (c) 2014 by Paul Sokolovsky
 *
 * This software is provided 'as-is', without any express
 * or implied warranty.  In no event will the authors be
 * held liable for any damages arising from the use of
 * this software.
 *
 * Permission is granted to anyone to use this software
 * for any purpose, including commercial applications,
 * and to alter it and redistribute it freely, subject to
 * the following restrictions:
 *
 * 1. The origin of this software must not be
 *    misrepresented; you must not claim that you
 *    wrote the original software. If you use this
 *    software in a product, an acknowledgment in
 *    the product documentation would be appreciated
 *    but is not required.
 *
 * 2. Altered source versions must be plainly marked
 *    as such, and must not be misrepresented as
 *    being the original software.
 *
 * 3. This notice may not be removed or altered from
 *    any source distribution.
 */
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
 #include <assert.h>
 #include "uzlib.h"
 jmp_buf unwindAddr;
 /* Minimum and maximum length of matches to look for, inclusive */
 #define MIN_MATCH      3
 #define MAX_MATCH      258
 /* Max offset of the match to look for, inclusive */
 #define MAX_OFFSET     16384 // 32768  //
 #define OFFSET16_MASK  0x7FFF
 #define NULL_OFFSET    0xFFFF
 #if MIN_MATCH < 3
 #error "Encoding requires a minium match of 3 bytes"
 #endif
 #define SIZE(a) (sizeof(a)/sizeof(*a)) /* no of elements in array */
 #ifdef __XTENSA__
 #define RAM_COPY_BYTE_ARRAY(c,s,sl)  uchar *c = alloca(sl); memcpy(c,s,(sl))
 #else
 #define RAM_COPY_BYTE_ARRAY(c,s,sl)  uchar *c = s;
 #endif
 #define FREE(v) if (v) uz_free(v)
 typedef uint8_t  uchar;
 typedef uint16_t ushort;
 typedef uint32_t uint;
 #ifdef DEBUG_COUNTS
 #define DBG_PRINT(...) printf(__VA_ARGS__)
 #define DBG_COUNT(n) (debugCounts[n]++)
 #define DBG_ADD_COUNT(n,m) (debugCounts[n]+=m)
 int debugCounts[20];
 #else
 #define DBG_PRINT(...)
 #define DBG_COUNT(n)
 #define DBG_ADD_COUNT(n,m)
 #endif
 /* Debugger hook: a convenient place to set a breakpoint; always yields 1 */
 int dbg_break(void) {
  return 1;
 }
 /*
 * One row of a length or distance code table: the code number, how many
 * extra bits follow it, and the inclusive [min,max] value range it covers.
 */
 typedef struct {
  ushort code, extraBits, min, max;
 } codeRecord;
 /*
 * Working tables for one compress call.  A single allocation (see
 * initTables) holds this struct followed by the outputBuf and then the
 * hashTable and hashChain arrays.
 */
 struct dynTables {
  ushort *hashChain;                     /* per-position link to previous entry with same hash */
  ushort *hashTable;                     /* hash -> most recent 16-bit offset */
  ushort hashMask;                       /* hashSlots - 1 */
  ushort hashSlots;                      /* number of entries in hashTable */
  ushort hashBits;                       /* used to derive the hash shift (24 - hashBits) */
  ushort dictLen;
  const uchar bitrevNibble[16];          /* 4-bit bit-reversal lookup */
  const codeRecord lenCodes[285-257+1];  /* length codes 257..285 */
  const codeRecord distCodes[29-0+1];    /* distance codes 0..29 */
 } *dynamicTables;
 /*
 * Growable output buffer plus the pending bit accumulator.
 */
 struct outputBuf {
  uchar *buffer;          /* output bytes */
  uint len, size;         /* bytes written so far / allocated size */
  uint inLen, inNdx;      /* input length / current input index, used to estimate resizes */
  uint bits, nBits;       /* bit accumulator and the number of valid bits in it */
  uint compDisabled;      /* non-zero: literal() writes raw bytes instead of Huffman codes */
 } *oBuf;
 /*
 * Set up the constant tables used to drive the compression
 *
 * Constants are stored in flash memory on the ESP8266 NodeMCU firmware
 * builds, but only word aligned data access are supported in hardware so
 * short and byte accesses are handled by a S/W exception handler and are
 * SLOW.  RAM is also at premium, so these short routines are driven by
 * byte vectors copied into RAM and then used to generate temporary RAM
 * tables, which are the same as the above statically declared versions.
 *
 * This might seem a bit convoluted but this runs faster and takes up less
 * memory than the original version.  This code also works fine on the
 * x86-64s so we just use one code variant.
 *
 * Note that fixed Huffman trees as defined in RFC 1951 Sec 3.2.5 are
 * always used. Whilst dynamic trees can give better compression for
 * larger blocks, this comes at a performance hit of having to compute
 * these trees. Fixed trees give better compression performance on short
 * blocks and significantly reduce compression times.
 *
 * The following defines are used to initialise these tables.
 */
 #define lenCodes_GEN \
  "\x03\x01\x01\x01\x01\x01\x01\x01\xff\x02\x02\x02\x02\xff\x04\x04\x04\x04" \
  "\xff\x08\x08\x08\x08\xff\x10\x10\x10\x10\xff\x20\x20\x20\x1f\xff\x01\x00"
 #define lenCodes_LEN 29
 #define distCodes_GEN \
  "\x01\x01\x01\x01\xff\x02\x02\xff\x04\x04\xff\x08\x08\xff\x10\x10\xff" \
  "\x20\x20\xff\x40\x40\xff\x86\x86\xff\x87\x87\xff\x88\x88\xff" \
  "\x89\x89\xff\x8a\x8a\xff\x8b\x8b\xff\x8c\x8c"
 #define distCodes_LEN 30
 #define BITREV16 "\x0\x8\x4\xc\x2\xa\x6\xe\x1\x9\x5\xd\x3\xb\x7\xf"
 /*
 * Expand a compact byte vector into a table of codeRecord entries.
 *
 *   rec      table to fill (written through a cast; len entries)
 *   len      number of records to generate
 *   init     generator bytes, initLen of them
 *   start    code number of the first record
 *   m0       upper bound of the range preceding the first record
 *
 * Each generator byte gives the width of the value range covered by the
 * next code.  A 0xFF byte is a marker that bumps the extra-bits count for
 * the following codes.  A byte with bit 7 set encodes a large width as
 * 2 << (low 7 bits) so that it still fits in a byte.
 */
 static void genCodeRecs (const codeRecord *rec, ushort len,
                    char *init, int initLen,
                    ushort start, ushort m0) {
  DBG_COUNT(0);
  int       i, b=0, m=0, last=m0;
  RAM_COPY_BYTE_ARRAY(c, (uchar *)init,initLen);
  codeRecord *p = (codeRecord *) rec;
  for (i = start; i < start+len; i++, c++) {
    if (*c == 0xFF)
      b++, c++;
    /* Mask off the flag bit before shifting: shifting an int by 134..140 */
    /* is undefined behaviour and only worked where the CPU truncates the */
    /* shift count.                                                       */
    m += (*c & 0x80) ? (2 << (*c & 0x7F)) : *c;
    /* Assign field by field: the order of evaluation inside an           */
    /* initializer list is unspecified, so min must not share an          */
    /* expression list with the update of last.                           */
    p->code      = i;
    p->extraBits = b;
    p->min       = last + 1;
    p->max       = m;
    last = m;
    p++;
  }
 }
 /*
 * Allocate and initialise all working storage for one compress call.
 *
 *   chainLen   number of ushort slots in the hash chain
 *   hashSlots  number of ushort slots in the hash table (hashMask is set
 *              to hashSlots - 1)
 *
 * One block is allocated holding, in order: struct dynTables, struct
 * outputBuf, the hash table and the hash chain.  The globals dynamicTables
 * and oBuf are pointed into it.  Throws UZLIB_MEMORY_ERROR (via
 * UZLIB_THROW) if the allocation fails.
 */
 static void initTables (uint chainLen, uint hashSlots) {
  DBG_COUNT(1);
  uint dynamicSize = sizeof(struct dynTables) +
                     sizeof(struct outputBuf) +
                     chainLen * sizeof(ushort) +
                     hashSlots * sizeof(ushort);
  /* Do a single malloc for dynamic tables and assign addresses */
  struct dynTables *dt = uz_malloc(dynamicSize);
  /* Publish the pointer first so that the handler which frees           */
  /* dynamicTables sees NULL (not a stale value) if we throw below.      */
  dynamicTables = dt;
  /* The NULL check must precede any write through dt */
  if(!dt )
    UZLIB_THROW(UZLIB_MEMORY_ERROR);
  memset(dt, 0, dynamicSize);
  memcpy((uchar*)dt->bitrevNibble, BITREV16, 16);
  oBuf          = (struct outputBuf *)(dt+1);
  dt->hashTable = (ushort *)(oBuf+1);
  dt->hashChain = dt->hashTable + hashSlots;
  dt->hashSlots = hashSlots;
  dt->hashMask = hashSlots - 1;
  /* As these are offset rather than pointer, 0 is a valid offset */
  /* (unlike NULL), so 0xFFFF is used to denote an unset value */
  memset(dt->hashTable, -1, sizeof(ushort)*hashSlots);
  memset(dt->hashChain, -1, sizeof(ushort)*chainLen);
  /* Generate the code records for the length and distance code tables */
  genCodeRecs(dt->lenCodes, SIZE(dt->lenCodes),
              lenCodes_GEN, sizeof(lenCodes_GEN),
              257,2);
  ((codeRecord *)(dynamicTables->lenCodes+285-257))->extraBits=0;  /* odd ball entry */
  genCodeRecs(dt->distCodes, SIZE(dt->distCodes),
              distCodes_GEN, sizeof(distCodes_GEN),
              0,0);
 }
 /*
 * Routines to output bit streams and byte streams to the output buffer
 */
 void resizeBuffer(void) {
  uchar *nb;
  DBG_COUNT(2);
  /* The outbuf is given an initial size estimate but if we are running */
  /* out of space then extropolate size using current compression */
  double newEstimate = (((double) oBuf->len)*oBuf->inLen) / oBuf->inNdx;
  oBuf->size = 128 + (uint) newEstimate;
  if (!(nb = realloc(oBuf->buffer, oBuf->size)))
    UZLIB_THROW(UZLIB_MEMORY_ERROR);
  oBuf->buffer = nb;
 }
 /*
 * Append the low nBits of bits to the output bit stream, least
 * significant bit first, and flush every complete byte to the buffer.
 * May grow the buffer (and therefore may throw UZLIB_MEMORY_ERROR).
 */
 void outBits(ushort bits, int nBits) {
  DBG_COUNT(3);
  oBuf->bits  |= bits << oBuf->nBits;
  oBuf->nBits += nBits;
  /* Compare as len + n >= size: the former size - n form wraps around  */
  /* for a buffer smaller than n bytes and so skipped the resize.       */
  if (oBuf->len + sizeof(bits) >= oBuf->size)
    resizeBuffer();
  while (oBuf->nBits >= 8) {
    DBG_PRINT("%02x-", oBuf->bits & 0xFF);
    oBuf->buffer[oBuf->len++] = oBuf->bits & 0xFF;
    oBuf->bits >>= 8;
    oBuf->nBits -= 8;
  }
 }
 /*
 * Bit-reverse an 8-bit value (Huffman codes go out most significant bit
 * first) and hand the low nBits of the result to outBits.
 */
 void outBitsRev(uchar bits, int nBits) {
  DBG_COUNT(4);
  const uchar *rev = dynamicTables->bitrevNibble;
  uchar lowNibble  = bits & 0x0f;
  uchar highNibble = bits >> 4;
  /* swapping the nibbles and reversing each one reverses the whole byte */
  uchar reversed = (rev[lowNibble] << 4) | rev[highNibble];
  outBits(reversed, nBits);
 }
 /*
 * Append nBytes raw bytes to the output buffer.  Any partially
 * accumulated bits are discarded, so the caller must flush them first if
 * they matter.  May grow the buffer (and may throw UZLIB_MEMORY_ERROR).
 */
 void outBytes(void *bytes, int nBytes) {
  DBG_COUNT(5);
  int i;
  /* Compare as len + n >= size: the former size - n form wraps around  */
  /* when the buffer is smaller than nBytes, so no resize happened and  */
  /* the copy below ran off the end of the allocation.                  */
  if (oBuf->len + nBytes >= oBuf->size)
    resizeBuffer();
  /* Note that byte output dumps any bits data so the caller must */
  /* flush this first, if necessary */
  oBuf->nBits = oBuf->bits  = 0;
  for (i = 0; i < nBytes; i++) {
    DBG_PRINT("%02x-", *((uchar*)bytes+i));
    oBuf->buffer[oBuf->len++] = *((uchar*)bytes+i);
  }
 }
 /*
 * Emit one literal byte: raw when compression is disabled, otherwise as
 * its 8 or 9 bit fixed-Huffman code.
 */
 void literal (uchar c) {
  DBG_COUNT(6);
  DBG_PRINT("sym: %02x   %c\n", c, c);
  if (oBuf->compDisabled) {
    /* Uncompressed block: the byte goes out unchanged. */
    outBits(c, 8);
    return;
  }
  if (c > 143) {
    /* 144..255 take 9 bits, counting up from 110010000: a leading 1 */
    /* followed by the remaining eight bits.                         */
    outBits(1, 1);
    outBitsRev(0x90 - 144 + c, 8);
    return;
  }
  /* 0..143 take 8 bits, counting up from 00110000. */
  outBitsRev(0x30 + c, 8);
 }
 /*
 * Output a dictionary (distance, length) pair as bitstream codes.
 *
 *   distance  how far back in the already-emitted data the match starts
 *   len       number of bytes to copy; values above 258 are emitted as a
 *             sequence of shorter copies at the same distance
 *
 * The length and distance are each mapped to a code record by binary
 * search over the generated lenCodes / distCodes tables, then written as
 * the fixed-Huffman code followed by any extra bits.  Must not be called
 * while compression is disabled (asserted).
 */
 void copy (int distance, int len) {
  DBG_COUNT(7);
  const codeRecord *lenCodes  = dynamicTables->lenCodes, *l;
  const codeRecord *distCodes = dynamicTables->distCodes, *d;
  int i, j, k;
  assert(!oBuf->compDisabled);
  while (len > 0) {
   /*
    * We can transmit matches of lengths 3 through 258
    * inclusive. So if len exceeds 258, we must transmit in
    * several steps, with 258 or less in each step.
    *
    * Specifically: if len >= 261, we can transmit 258 and be
    * sure of having at least 3 left for the next step. And if
    * len <= 258, we can just transmit len. But if len == 259
    * or 260, we must transmit len-3.
    */
    int thislen = (len > 260 ? 258 : len <= 258 ? len : len - 3);
    len -= thislen;
    /*
     * Binary-search to find which length code we're
     * transmitting.
     */
    /* i and j are exclusive lower / upper bounds on the table index */
    i = -1;
    j = lenCodes_LEN;
    while (1) {
      assert(j - i >= 2);
      k = (j + i) / 2;
      if (thislen < lenCodes[k].min)
        j = k;
      else if (thislen > lenCodes[k].max)
        i = k;
      else {
        l = &lenCodes[k];
        break;                 /* found it! */
      }
    }
    /*
     * Transmit the length code. 256-279 are seven bits
     * starting at 0000000; 280-287 are eight bits starting at
     * 11000000.
     */
    if (l->code <= 279) {
      outBitsRev((l->code - 256) * 2, 7);
    } else {
      outBitsRev(0xc0 - 280 + l->code, 8);
    }
    /*
     * Transmit the extra bits.
     */
    if (l->extraBits)
      outBits(thislen - l->min, l->extraBits);
    /*
     * Binary-search to find which distance code we're
     * transmitting.
     */
    i = -1;
    j = distCodes_LEN;
    while (1) {
      assert(j - i >= 2);
      k = (j + i) / 2;
      if (distance < distCodes[k].min)
        j = k;
      else if (distance > distCodes[k].max)
        i = k;
      else {
        d = &distCodes[k];
        break;                 /* found it! */
      }
    }
    /*
     * Transmit the distance code. Five bits starting at 00000.
     */
    outBitsRev(d->code * 8, 5);
    /*
     * Transmit the extra bits.
     */
    if (d->extraBits)
      outBits(distance - d->min, d->extraBits);
  }
 }
 /*
 * Block compression uses a hashTable to index into a set of search
 * chainList, where each chain links together the triples of chars within
 * the dictionary (the last MAX_OFFSET bytes of the input buffer) with
 * the same hash index. So for compressing a file of 200Kb, say, with a
 * 16K dictionary (the largest that we can inflate within the memory
 * constraints of the ESP8266), the chainList is 16K slots long, and the
 * hashTable is 4K slots long, so a typical chain will have 4 links.
 *
 * These two tables use 16-bit ushort offsets rather than pointers to
 * save memory (essential on the ESP8266).
 *
 * As per RFC 1951 sec 4, we also implement a "lazy match" procedure
 */
 /*
 * Compress src[0..srcLen) into the current output buffer as a stream of
 * literal() and copy() calls, using a hash-chained search of the sliding
 * dictionary with a one-step lazy match.
 *
 *   src     input bytes
 *   srcLen  number of input bytes
 *
 * Requires initTables() to have set up dynamicTables and oBuf, and
 * dynamicTables->hashBits to have been set by the caller.
 */
 void uzlibCompressBlock(const uchar *src, uint srcLen) {
  int i, j, k, l;
  uint hashMask     = dynamicTables->hashMask;
  ushort *hashChain = dynamicTables->hashChain;
  ushort *hashTable = dynamicTables->hashTable;
  uint hashShift    = 24 - dynamicTables->hashBits;
  uint lastOffset   = 0, lastLen = 0;
  oBuf->inLen       = srcLen;          /* used for output buffer resizing */
  DBG_COUNT(9);
  for (i = 0; i <= ((int)srcLen) - MIN_MATCH; i++) {
   /*
    * Calculate a hash on the next three chars using the liblzf hash
    * function, then use this via the hashTable to index into the chain
    * of triples within the dictionary window which have the same hash.
    *
    * Note that using 16-bit offsets requires a little manipulation to
    * handle wrap-around and recover the correct offset, but all other
    * working uses uint offsets simply because the compiler generates
    * faster (and smaller in the case of the ESP8266) code.
    *
    * The loop bound keeps all three hashed bytes inside the input; any
    * bytes left over at the tail are flushed as literals after the loop.
    */
    const uchar *this = src + i, *comp;
    uint base        = i & ~OFFSET16_MASK;
    uint iOffset     = i - base;
    uint maxLen      = srcLen - i;
    uint matchLen    = MIN_MATCH - 1;
    uint matchOffset = 0;
    uint v          = (this[0] << 16) | (this[1] << 8) | this[2];
    uint hash       = ((v >> hashShift) - v) & hashMask;
    uint nextOffset = hashTable[hash];
    oBuf->inNdx = i;                   /* used for output buffer resizing */
    DBG_COUNT(10);
    if (maxLen>MAX_MATCH)
      maxLen = MAX_MATCH;
    hashTable[hash] = iOffset;
    hashChain[iOffset & (MAX_OFFSET-1)] = nextOffset;
    /* walk at most 60 links of the chain looking for the longest match */
    for (l = 0; nextOffset != NULL_OFFSET && l<60; l++) {
      DBG_COUNT(11);
      /* handle the case where base has bumped */
      j = base + nextOffset - ((nextOffset < iOffset) ? 0 : (OFFSET16_MASK + 1));
      if (i - j > MAX_OFFSET)
        break;
      /* Test the length bound BEFORE touching this[k]: with the operands */
      /* the other way round the final comparison read this[maxLen],      */
      /* which is one byte past the end of src when the match runs to the */
      /* end of the input.                                                */
      for (k = 0, comp = src + j; k < maxLen && this[k] == comp[k]; k++)
        {}
      DBG_ADD_COUNT(12, k);
      if (k > matchLen) {
         matchOffset = i - j;
         matchLen = k;
      }
      nextOffset = hashChain[nextOffset & (MAX_OFFSET-1)];
    }
    if (lastOffset) {
      if (matchOffset == 0 || lastLen >= matchLen  ) {
        /* ignore this match (or not) and process last */
        DBG_COUNT(14);
        copy(lastOffset, lastLen);
        DBG_PRINT("dic: %6x %6x %6x\n", i-1, lastLen, lastOffset);
        /* the cached match began at i-1; skip what it covered, allowing */
        /* for the loop's own i++                                        */
        i += lastLen - 1 - 1;
        lastOffset = lastLen = 0;
      } else {
        /* ignore last match and emit a symbol instead; cache this one */
        DBG_COUNT(15);
        literal(this[-1]);
        lastOffset = matchOffset;
        lastLen = matchLen;
      }
    } else { /* no last match */
      if (matchOffset) {
        DBG_COUNT(16);
        /* cache this one */
        lastOffset = matchOffset;
        lastLen = matchLen;
      } else {
        DBG_COUNT(17);
        /* emit a symbol; last already clear */
        literal(this[0]);
      }
    }
  }
  if (lastOffset) {                     /* flush cached match if any */
    copy(lastOffset, lastLen);
    DBG_PRINT("dic: %6x %6x %6x\n", i, lastLen, lastOffset);
    i += lastLen - 1;
  }
  while (i < srcLen)
    literal(src[i++]);                  /* flush the last few bytes if needed */
 }
 /*
 * This compress wrapper treats the input stream as a single block for
 * compression using the default Static huffman block encoding
 */
 /*
 * Compress src[0..srcLen) into a newly allocated gzip image made of a
 * single fixed-Huffman deflate block.
 *
 *   dest     receives the output buffer (caller owns it), or NULL on error
 *   destLen  receives the output length, or 0 on error
 *   src      input bytes
 *   srcLen   number of input bytes
 *
 * Returns UZLIB_OK on success, otherwise the error code (for example
 * UZLIB_MEMORY_ERROR) that was raised while compressing.
 */
 int uzlib_compress (uchar **dest, uint *destLen, const uchar *src, uint srcLen) {
  uint crc = ~uzlib_crc32(src, srcLen, ~0);
  uint chainLen = srcLen < MAX_OFFSET ? srcLen : MAX_OFFSET;
  uint hashSlots, i, j;
  int status;
  uchar *outBuf = NULL;       /* output buffer, captured before the tables are freed */
  uint outLen = 0;
  uint FLG_MTIME[] = {0x00088b1f, 0};
  ushort XFL_OS = 0x0304;
  /* The hash table has 4K slots for a 16K chain and scaling down */
  /* accordingly, for an average chain length of 4 links or thereabouts */
  for (i = 256, j = 8 - 2; i < chainLen; i <<= 1)
    j++;
  hashSlots = i >> 2;
  /* Clear the globals so that the cleanup below never acts on pointers */
  /* left over from an earlier call if initTables throws.               */
  dynamicTables = NULL;
  oBuf = NULL;
  if ((status = UZLIB_SETJMP(unwindAddr)) == 0) {
    initTables(chainLen, hashSlots);
    oBuf->size = srcLen/5;    /* initial guess of a 5x compression ratio */
    if (oBuf->size < 64)      /* always leave room for header and trailer */
      oBuf->size = 64;
    oBuf->buffer = uz_malloc(oBuf->size);
    dynamicTables->hashSlots = hashSlots;
    dynamicTables->hashBits = j;
    if(!oBuf->buffer ) {
      status = UZLIB_MEMORY_ERROR;
    } else {
      /* Output gzip and block headers */
      outBytes(FLG_MTIME, sizeof(FLG_MTIME));
      outBytes(&XFL_OS, sizeof(XFL_OS));
      outBits(1, 1); /* Final block */
      outBits(1, 2); /* Static huffman block */
      uzlibCompressBlock(src, srcLen);  /* Do the compress */
      /* Output block finish */
      outBits(0, 7); /* close block */
      outBits(0, 7); /* Make sure all bits are flushed */
      outBytes(&crc, sizeof(crc));
      outBytes(&srcLen, sizeof(srcLen));
      status = UZLIB_OK;
    }
  }
  /* else: we arrived here through UZLIB_THROW and status already holds */
  /* the error code; it must not be overwritten with UZLIB_OK.          */

  /* oBuf lives inside the dynamicTables allocation, so take what we    */
  /* need from it before that block is released.                        */
  if (oBuf) {
    outBuf = oBuf->buffer;
    outLen = oBuf->len;
  }
  FREE(dynamicTables);
  dynamicTables = NULL;
  oBuf = NULL;
  for (i=0; i<20;i++) DBG_PRINT("count %u = %u\n",i,debugCounts[i]);
  if (status == UZLIB_OK) {
    /* trim to the exact length; keep the untrimmed buffer if that fails */
    uchar *trimBuf = realloc(outBuf, outLen);
    *dest = trimBuf ? trimBuf : outBuf;
    *destLen = outLen;
  } else {
    *dest = NULL;
    *destLen = 0;
    FREE(outBuf);
  }
  return status;
 }
--- a/app/uzlib/uzlib_inflate.c
+++ b/app/uzlib/uzlib_inflate.c
@ -0,0 +1,603 @@
 /*
 * tinfgzip.c   - tiny gzip decompressor
 * tinflate.c  -  tiny inflate
 *
 * The original source headers as below for licence compliance and in
 * full acknowledgement of the originator contributions.  Modified by
 * Terry Ellison 2018 to provide lightweight stream inflate for NodeMCU
 * Lua.  Modifications are under the standard NodeMCU MIT licence.
 *
 * Copyright (c) 2003 by Joergen Ibsen / Jibz
 * All Rights Reserved
 * http://www.ibsensoftware.com/
 *
 * Copyright (c) 2014-2016 by Paul Sokolovsky
 *
 * This software is provided 'as-is', without any express
 * or implied warranty.  In no event will the authors be
 * held liable for any damages arising from the use of
 * this software.
 *
 * Permission is granted to anyone to use this software
 * for any purpose, including commercial applications,
 * and to alter it and redistribute it freely, subject to
 * the following restrictions:
 *
 * 1. The origin of this software must not be
 *    misrepresented; you must not claim that you
 *    wrote the original software. If you use this
 *    software in a product, an acknowledgment in
 *    the product documentation would be appreciated
 *    but is not required.
 *
 * 2. Altered source versions must be plainly marked
 *    as such, and must not be misrepresented as
 *    being the original software.
 *
 * 3. This notice may not be removed or altered from
 *    any source distribution.
 */
 #include <string.h>
 #ifdef __XTENSA__
 #include "c_stdio.h"
 #else
 #include <stdio.h>
 #endif
 #include "uzlib.h"
 #ifdef DEBUG_COUNTS
 #define DBG_PRINT(...) printf(__VA_ARGS__)
 #define DBG_COUNT(n) (debugCounts[n]++)
 #define DBG_ADD_COUNT(n,m) (debugCounts[n]+=m)
 int debugCounts[20];
 #else
 #define NDEBUG
 #define DBG_PRINT(...)
 #define DBG_COUNT(n)
 #define DBG_ADD_COUNT(n,m)
 #endif
 #define SIZE(arr) (sizeof(arr) / sizeof(*(arr)))
 jmp_buf unwindAddr;
 /* Debugger hook: a convenient place to set a breakpoint; always yields 1 */
 int dbg_break(void) {
  return 1;
 }
 typedef uint8_t  uchar;
 typedef uint16_t ushort;
 typedef uint32_t uint;
 /* data structures */
 /* One Huffman tree in the compact "count per length + sorted symbols" form */
 typedef struct {
   ushort table[16];  /* table of code length counts */
   ushort trans[288]; /* code -> symbol translation table */
 } UZLIB_TREE;
 /* Complete working state for one uzlib_inflate call */
 struct uzlib_data {
 /*
  * extra bits and base tables for length and distance codes
  */
  uchar  lengthBits[30];
  ushort lengthBase[30];
  uchar  distBits[30];
  ushort distBase[30];
 /*
  * special ordering of code length codes
  */
  uchar  clcidx[19];
 /*
  * dynamic length/symbol and distance trees
  */
  UZLIB_TREE ltree;
  UZLIB_TREE dtree;
 /*
  * methods encapsulate handling of the input and output streams
  */
  uchar (*get_byte)(void);
  void (*put_byte)(uchar b);
  uchar (*recall_byte)(uint offset);
 /*
  * Other state values
  */
  uint destSize;   /* countdown of inflate steps, seeded from the caller's len */
  uint tag;        /* bit buffer holding input bits not yet consumed */
  uint bitcount;   /* number of valid bits remaining in tag */
  uint lzOffs;     /* offset handed to recall_byte for the current match */
  int  bType;      /* current block type (0, 1 or 2); -1 before the first block */
  int  bFinal;     /* final-block flag read from the current block header */
  uint curLen;     /* bytes still to produce for the current match / stored block */
  uint checksum;   /* 32-bit value read from the stream trailer */
 };        
 /*
 * Note on changes to layout, naming, etc.  This module combines extracts
 * from 3 code files from two sources (Sokolovsky, Ibsen et al) with perhaps
 * 30% from me Terry Ellison. These sources had inconsistent layout and
 * naming conventions, plus extra conditional handling of platforms that
 * cannot support NodeMCU. (This is intended to be run compiled and executed
 * on GCC POSIX and Xtensa newlib environments.)  So I have (1) reformatted
 * this file in line with NodeMCU rules; (2) demoted all private data and
 * functions to static and removed the redundant name  prefixes; (3) reordered
 * functions into a more logic order; (4) added some ESP architecture
 * optimisations, for example these IoT devices are very RAM limited, so
 * statically allocating large RAM blocks is against programming guidelines.
 */
 /*
 * Discard input.  A non-zero num skips exactly that many bytes; num == 0
 * skips up to and including the next NUL byte.
 */
 static void skip_bytes(UZLIB_DATA *d, int num) {
  if (!num) {
    /* string mode: consume through the terminating NUL */
    while (d->get_byte()) {}
    return;
  }
  /* counted mode */
  while (num--) (void) d->get_byte();
 }
 /* Read a little-endian 16-bit value from the input (low byte first) */
 static uint16_t get_uint16(UZLIB_DATA *d) {
  uint16_t lo = d->get_byte();
  uint16_t hi = d->get_byte();
  return lo | (hi << 8);
 }
 /* Read a little-endian 32-bit value from the input as two 16-bit halves */
 static uint get_le_uint32 (UZLIB_DATA *d) {
  uint lowHalf  = get_uint16(d);
  uint highHalf = get_uint16(d);
  return lowHalf | (highHalf << 16);
 }
 /* Fetch the next single bit of the input stream (LSB of each byte first) */
 static int getbit (UZLIB_DATA *d) {
  uint bit;
  if (d->bitcount == 0) {
    /* bit buffer exhausted: refill it with a whole byte, of which one */
    /* bit is consumed straight away                                   */
    d->tag = d->get_byte();
    d->bitcount = 7;
  } else {
    d->bitcount--;
  }
  /* take the low bit and shift the rest down */
  bit = d->tag & 0x01;
  d->tag >>= 1;
  return bit;
 }
 /*
 * Read a num-bit value from the stream and return it added to base.
 * num == 0 reads nothing and returns base unchanged.  This pulls whole
 * bytes into the bit buffer rather than calling getbit num times.
 */
 static uint read_bits (UZLIB_DATA *d, int num, int base) {
  uint have, value;
  if (num == 0)
    return base;
  /* top up the bit buffer a byte at a time until it holds num bits */
  have = d->bitcount;
  while (have < num) {
    d->tag |= ((uint)d->get_byte()) << have;
    have += 8;
  }
  /* peel off the low num bits and keep the remainder for next time */
  value = d->tag & ~(((uint)-1) << num);
  d->tag >>= num;
  d->bitcount = have - num;
  return base + value;
 }
 /* --------------------------------------------------- *
 * -- uninitialized global data (static structures) -- *
 * --------------------------------------------------- */
 /*
 * Constants are stored in flash memory on the ESP8266 NodeMCU firmware
 * builds, but only word aligned data access are supported in hardware so
 * short and byte accesses are handled by a S/W exception handler and
 * are SLOW.  RAM is also at premium, especially static initialised vars,
 * so we malloc a single block on first call to hold all tables and call
 * the dynamic generator to generate malloced RAM tables that have the
 * same content as the above statically declared versions.
 *
 * This might seem a bit convoluted but this runs faster and takes up
 * less memory than the static version on the ESP8266.
 */
 #define CLCIDX_INIT \
 "\x10\x11\x12\x00\x08\x07\x09\x06\x0a\x05\x0b\x04\x0c\x03\x0d\x02\x0e\x01\x0f"
 /* ----------------------- *
 * -- utility functions -- *
 * ----------------------- */
 /*
 * Fill the 30-entry extra-bits and base-value tables.  The first delta
 * codes carry no extra bits; after that the extra-bit count rises by one
 * every delta codes.  base[] starts at first and advances by the size of
 * each code's range.
 */
 static void build_bits_base (uchar *bits, ushort *base,
                             int delta, int first) {
  int idx = 0, next = first;
  /* extra-bits table */
  while (idx < delta)
    bits[idx++] = 0;
  for (idx = delta; idx < 30; ++idx)
    bits[idx] = (idx - delta) / delta;
  /* base table: running total of the range sizes */
  for (idx = 0; idx < 30; ++idx) {
    base[idx] = next;
    next += 1 << bits[idx];
  }
 }
 /*
 * Populate lt and dt with the fixed literal/length and distance trees
 * used by deflate block type 1.
 */
 static void build_fixed_trees (UZLIB_TREE *lt, UZLIB_TREE *dt) {
  int n;
  ushort *slot;
  /* literal/length tree: 24 seven-bit, 152 eight-bit, 112 nine-bit codes */
  for (n = 0; n < 7; ++n)
    lt->table[n] = 0;
  lt->table[7] = 24;
  lt->table[8] = 152;
  lt->table[9] = 112;
  /* symbols in code order: 256..279, 0..143, 280..287, 144..255 */
  slot = lt->trans;
  for (n = 256; n < 280; ++n) *slot++ = n;
  for (n = 0;   n < 144; ++n) *slot++ = n;
  for (n = 280; n < 288; ++n) *slot++ = n;
  for (n = 144; n < 256; ++n) *slot++ = n;
  /* distance tree: 32 five-bit codes mapping to themselves */
  for (n = 0; n < 5; ++n)
    dt->table[n] = 0;
  dt->table[5] = 32;
  for (n = 0; n < 32; ++n)
    dt->trans[n] = n;
 }
 /*
 * Build tree t from an array of num code lengths (one per symbol; a
 * length of 0 means the symbol is unused).
 */
 static void build_tree (UZLIB_TREE *t, const uchar *lengths, uint num) {
  ushort next[16];
  uint n, running = 0;
  /* count how many symbols use each code length */
  memset(t->table, 0, sizeof(t->table));
  for (n = 0; n < num; ++n)
    t->table[lengths[n]]++;
  t->table[0] = 0;                 /* unused symbols do not count */
  /* first output slot for each code length (distribution sort) */
  for (n = 0; n < 16; ++n) {
    next[n] = running;
    running += t->table[n];
  }
  /* place each used symbol in its slot, giving symbols sorted by code */
  for (n = 0; n < num; ++n) {
    uchar codeLen = lengths[n];
    if (codeLen)
      t->trans[next[codeLen]++] = n;
  }
 }
 /* ---------------------- *
 * -- decode functions -- *
 * ---------------------- */
 /*
 * Decode one symbol from the stream using tree t.  Returns the symbol,
 * or UZLIB_DATA_ERROR if no code matches within the table's length range
 * or the resulting index falls outside the translation table.
 */
 static int decode_symbol (UZLIB_DATA *d, UZLIB_TREE *t) {
  int index = 0, code = 0, bits = 0;
  /* extend the code one bit at a time until it falls inside the range */
  /* of codes that exist at the current length                         */
  for (;;) {
    code = 2*code + getbit(d);
    bits++;
    if (bits == SIZE(t->table))
      return UZLIB_DATA_ERROR;
    index += t->table[bits];
    code  -= t->table[bits];
    if (code < 0)
      break;
  }
  index += code;
  if (index < 0 || index >= SIZE(t->trans))
    return UZLIB_DATA_ERROR;
  return t->trans[index];
 }
 /*
 * Read the dynamic Huffman tree description at the start of a type 2
 * block and build the literal/length tree lt and the distance tree dt.
 * Returns UZLIB_OK, or UZLIB_DATA_ERROR (or the error from
 * decode_symbol) if the description is malformed.
 */
 static int decode_trees (UZLIB_DATA *d, UZLIB_TREE *lt, UZLIB_TREE *dt) {
  uchar lengths[288+32];
  uint hlit, hdist, hclen, hlimit;
  uint i, num, length;
  /* get 5 bits HLIT (257-286) */
  hlit = read_bits(d, 5, 257);
  /* get 5 bits HDIST (1-32) */
  hdist = read_bits(d, 5, 1);
  /* get 4 bits HCLEN (4-19) */
  hclen = read_bits(d, 4, 4);
  for (i = 0; i < 19; ++i) lengths[i] = 0;
  /* read code lengths for code length alphabet */
  for (i = 0; i < hclen; ++i) {
    /* get 3 bits code length (0-7) */
    uint clen = read_bits(d, 3, 0);
    lengths[d->clcidx[i]] = clen;
  }
  /* build code length tree, temporarily use length tree */
  build_tree(lt, lengths, 19);
  /* decode code lengths for the dynamic trees */
  hlimit = hlit + hdist;
  for (num = 0; num < hlimit; ) {
    int sym = decode_symbol(d, lt);
    uchar fill_value = 0;
    int lbits, lbase = 3;
    /* error decoding */
    if (sym < 0)
      return sym;
    switch (sym) {
    case 16:
      /* copy previous code length 3-6 times (read 2 bits) */
      /* A repeat with nothing before it is invalid; without this check */
      /* num - 1 wraps (num is unsigned) and indexes far outside lengths */
      if (num == 0)
        return UZLIB_DATA_ERROR;
      fill_value = lengths[num - 1];
      lbits = 2;
      break;
    case 17:
      /* repeat code length 0 for 3-10 times (read 3 bits) */
      lbits = 3;
      break;
    case 18:
      /* repeat code length 0 for 11-138 times (read 7 bits) */
      lbits = 7;
      lbase = 11;
      break;
    default:
      /* values 0-15 represent the actual code lengths */
      lengths[num++] = sym;
      /* continue the for loop */
      continue;
    }
    /* special code length 16-18 are handled here */
    length = read_bits(d, lbits, lbase);
    if (num + length > hlimit)
      return UZLIB_DATA_ERROR;
    for (; length; --length)
      lengths[num++] = fill_value;
  }
  /* build dynamic trees */
  build_tree(lt, lengths, hlit);
  build_tree(dt, lengths + hlit, hdist);
  return UZLIB_OK;
 }
 /* ----------------------------- *
 * -- block inflate functions -- *
 * ----------------------------- */
 /*
 * Produce one output byte from a Huffman-coded block using trees lt and
 * dt.  When no match is in progress a symbol is decoded: a literal is
 * emitted directly, end-of-block returns UZLIB_DONE, and a length code
 * starts a match whose bytes are then emitted one per call.  Returns
 * UZLIB_OK after emitting a byte, or UZLIB_DATA_ERROR on a bad symbol.
 */
 static int inflate_block_data (UZLIB_DATA *d, UZLIB_TREE *lt, UZLIB_TREE *dt) {
  if (d->curLen == 0) {
    int dist;
    int sym = decode_symbol(d, lt);
    /* decode failure: must be caught here, otherwise the negative error */
    /* code satisfies "sym < 256" and is written out as a literal byte   */
    if (sym < 0)
      return sym;
    /* literal byte */
    if (sym < 256) {
       DBG_PRINT("huff sym: %02x   %c\n", sym, sym);
       d->put_byte(sym);
       return UZLIB_OK;
    }
    /* end of block */
    if (sym == 256)
       return UZLIB_DONE;
    /* substring from sliding dictionary */
    sym -= 257;
    /* only length codes 257..285 (indices 0..28) are valid; the fixed   */
    /* tree can also yield 286 and 287, which index past the tables      */
    if (sym >= 29)
      return UZLIB_DATA_ERROR;
    /* possibly get more bits from length code */
    d->curLen = read_bits(d, d->lengthBits[sym], d->lengthBase[sym]);
    dist = decode_symbol(d, dt);
    /* reject a decode failure and distance codes 30/31, both of which   */
    /* would index outside the 30-entry distance tables                  */
    if (dist < 0 || dist >= 30)
      return UZLIB_DATA_ERROR;
    /* possibly get more bits from distance code */
    d->lzOffs = read_bits(d, d->distBits[dist], d->distBase[dist]);
    DBG_PRINT("huff dict: -%u for %u\n", d->lzOffs, d->curLen);
  }
  /* copy next byte from dict substring */
  uchar b = d->recall_byte(d->lzOffs);
  DBG_PRINT("huff dict byte(%u): -%u -  %02x   %c\n\n",
          d->curLen, d->lzOffs, b, b);
  d->put_byte(b);
  d->curLen--;
  return UZLIB_OK;
 }
 /*
 * Produce one output byte from a stored (uncompressed) block.  The first
 * call of a block reads and checks the LEN / NLEN header.  Returns
 * UZLIB_OK after copying a byte, UZLIB_DONE once the block is exhausted
 * and UZLIB_DATA_ERROR if the header check fails.
 */
 static int inflate_uncompressed_block (UZLIB_DATA *d) {
  if (!d->curLen) {
    uint len  = get_uint16(d);
    uint nlen = get_uint16(d);
    /* NLEN must be the one's complement of LEN */
    if ((~nlen & 0x0000ffff) != len)
      return UZLIB_DATA_ERROR;
    /* one extra count so that the DONE below is reported on a call */
    /* that produces no data                                        */
    d->curLen = len + 1;
    /* the next block starts on a byte boundary */
    d->bitcount = 0;
  }
  d->curLen -= 1;
  if (d->curLen == 0)
    return UZLIB_DONE;
  d->put_byte(d->get_byte());
  return UZLIB_OK;
 }
 /* -------------------------- *
 * -- main parse functions -- *
 * -------------------------- */
 /*
 * Consume and validate the gzip member header, skipping every optional
 * field.  Returns UZLIB_OK if the header is acceptable, otherwise
 * UZLIB_DATA_ERROR (bad magic, method other than deflate, or reserved
 * flag bits set).
 */
 static int parse_gzip_header(UZLIB_DATA *d) {
  /* check id bytes */
  if (d->get_byte() != 0x1f || d->get_byte() != 0x8b)
    return UZLIB_DATA_ERROR;
  if (d->get_byte() != 8) /* check method is deflate */
    return UZLIB_DATA_ERROR;
  uchar flg = d->get_byte();/* get flag byte */
  if (flg & 0xe0)/* check that reserved bits are zero */
    return UZLIB_DATA_ERROR;
  skip_bytes(d, 6);            /* skip rest of base header of 10 bytes */
  if (flg & UZLIB_FEXTRA) {          /* skip extra data if present */
    uint xlen = get_uint16(d);
    /* An empty extra field is legal.  skip_bytes treats a count of 0 as */
    /* "skip to the next NUL", which would eat stream data, so only call */
    /* it when there really is something to skip.                        */
    if (xlen)
      skip_bytes(d, xlen);
  }
  if (flg & UZLIB_FNAME)             /* skip file name if present */
    skip_bytes(d,0);
  if (flg & UZLIB_FCOMMENT)          /* skip file comment if present */
    skip_bytes(d,0);
  if (flg & UZLIB_FHCRC)             /* ignore header crc if present */
    skip_bytes(d,2);
  return UZLIB_OK;
 }
 /*
 * Drive the block state machine.  Each pass of the loop starts a new
 * block if needed and then performs one inflate step on the current
 * block; the loop repeats while the pre-decremented d->destSize is
 * non-zero.  Returns UZLIB_OK when that counter runs out, UZLIB_DONE when
 * the final block has ended, or an error code from the step functions.
 *
 * NOTE(review): destSize is unsigned, so entering with destSize == 0
 * makes the decrement wrap rather than stop the loop - confirm callers
 * rely on UZLIB_DONE / errors for termination in that case.
 */
 static int uncompress_stream (UZLIB_DATA *d) {
  do {
    int res;
    /* start a new block */
    if (d->bType == -1) {
      /* also entered by goto when a non-final block has just ended */
      next_blk:
      /* read final block flag */
      d->bFinal = getbit(d);
      /* read block type (2 bits) */
      d->bType = read_bits(d, 2, 0);
      DBG_PRINT("Started new block: type=%d final=%d\n", d->bType, d->bFinal);
      if (d->bType == 1) {
        /* build fixed huffman trees */
        build_fixed_trees(&d->ltree, &d->dtree);
      } else if (d->bType == 2) {
        /* decode trees from stream */
        res = decode_trees(d, &d->ltree, &d->dtree);
        if (res != UZLIB_OK)
          return res;
      }
    }
    /* process current block */
    switch (d->bType) {
    case 0:
      /* decompress uncompressed block */
      res = inflate_uncompressed_block(d);
      break;
    case 1:
    case 2:
      /* decompress block with fixed or dynamic huffman trees.  These */
      /* trees were decoded previously, so it's the same routine for both */
      res = inflate_block_data(d, &d->ltree, &d->dtree);
      break;
    default:
      /* block type 3 is not handled */
      return UZLIB_DATA_ERROR;
    }
    if (res == UZLIB_DONE && !d->bFinal) {
      /* the block has ended (without producing more data), but we
         can't return without data, so start procesing next block */
      goto next_blk;
    }
    if (res != UZLIB_OK)
      return res;
  } while (--d->destSize);
  return UZLIB_OK;
 }
 /*
 * This implementation has a different usecase to Paul Sokolovsky's
 * uzlib implementation, in that it is designed to target IoT devices
 * such as the ESP8266.  Here clarity and compact code size is an
 * advantage, but the ESP8266 only has 40-45Kb free heap, and has to
 * process files with an unpacked size of up 256Kb, so a streaming
 * implementation is essential.
 *
 * I have taken the architectural decision to hide the implementation
 * details from the uncompress routines and the caller must provide
 * three support routines to handle the streaming:
 *
 *   uchar get_byte(void)
 *   void put_byte(uchar b)
 *   uchar recall_byte(uint offset)
 *
 * This last must be able to recall an output byte with an offset up to
 * the maximum dictionary size.
 *
 * Parameters:
 *   len    seeds the internal output step counter
 *   crc    if non-NULL, receives the checksum field on exit (the value
 *          read from the stream trailer, or 0 if the trailer was never
 *          reached)
 *   state  set to the working state for the duration of the call and
 *          reset to NULL before returning
 *
 * Returns UZLIB_MEMORY_ERROR if the state cannot be allocated, otherwise
 * the final status: UZLIB_DONE on a complete stream or an error code.
 * All exits after allocation go through UZLIB_THROW so that the state is
 * released in one place.
 */
 int uzlib_inflate (
     uchar (*get_byte)(void),
     void (*put_byte)(uchar v),
     uchar (*recall_byte)(uint offset),
     uint len, uint *crc, void **state) {
  int res;
  /* initialize decompression structure */
  UZLIB_DATA *d = (UZLIB_DATA *) uz_malloc(sizeof(*d));
  if (!d)
    return UZLIB_MEMORY_ERROR;
  /* Zero the whole state first: checksum (reported through *crc even on */
  /* error exits), tag and lzOffs were previously left uninitialised.    */
  memset(d, 0, sizeof(*d));
  *state = d;
  d->bitcount    = 0;
  d->bFinal      = 0;
  d->bType       = -1;
  d->curLen      = 0;
  d->destSize    = len;
  d->get_byte    = get_byte;
  d->put_byte    = put_byte;
  d->recall_byte = recall_byte;
  if ((res = UZLIB_SETJMP(unwindAddr)) != 0) {
    /* handle long jump: report the checksum, release the state, return */
    if (crc)
      *crc = d->checksum;
    uz_free(d);
    *state = NULL;
    return res;
  }
  /* create RAM copy of clcidx byte array */
  memcpy(d->clcidx, CLCIDX_INIT, sizeof(d->clcidx));
  /* build extra bits and base tables */
  build_bits_base(d->lengthBits, d->lengthBase, 4, 3);
  build_bits_base(d->distBits, d->distBase, 2, 1);
  d->lengthBits[28] = 0;              /* fix a special case */
  d->lengthBase[28] = 258;
  if ((res = parse_gzip_header(d))== UZLIB_OK)
    while ((res = uncompress_stream(d)) == UZLIB_OK)
      {}
  if (res == UZLIB_DONE) {
    d->checksum = get_le_uint32(d);
    (void) get_le_uint32(d);         /* already got length so ignore */
  }
  /* res is never UZLIB_OK (0) here, so the handler above always runs */
  UZLIB_THROW(res);
 }
--- a/docs/en/compiling.md
+++ b/docs/en/compiling.md
@ -0,0 +1,68 @@
 Whilst the Lua Virtual Machine (LVM) can compile Lua source dynamically and this can prove
 very flexible during development, you will use less RAM resources if you precompile 
 your sources before execution.
 ## Compiling Lua directly on your ESP8266
 -  The standard [string.dump \(function)](https://www.lua.org/manual/5.1/manual.html#pdf-string.dump) returns a string containing the binary code for the specified function and you can write this to a SPIFFS file.
 -  [`node.compile()`](modules/node/#nodecompile) wraps this 'load and dump to file' operation into a single atomic library call.
 The issue with both of these approaches is that compilation is RAM-intensive and hence
 you will find that you will need to break your application into a lot of small and 
 compilable modules in order to avoid hitting RAM constraints.  This can be mitigated
 by doing all compiles immediately after a [`node.restart()`](modules/node/#noderestart).
 ## Compiling Lua on your PC for Uploading
 If you install `lua` on your development PC or Laptop then you can use the standard Lua
 compiler to syntax check any Lua source before downloading it to the ESP8266 module.  However,
 the NodeMCU compiler output uses different data types (e.g. it supports ROMtables) so the
 compiled output from standard `luac` cannot run on the ESP8266.  
 Compiling source on one platform for use on another (e.g. Intel 64-bit Windows to ESP8266) is 
 known as _cross-compilation_ and the NodeMCU firmware build now automatically generates
 a `luac.cross` image as standard in the firmware root directory; this can be used to
 compile and to syntax-check Lua source on the Development machine for execution under 
 NodeMCU Lua on the ESP8266.
 `luac.cross` will translate Lua source files into binary files that can be later loaded
 and executed by the LVM.  Such binary files, which normally have the `.lc` (lua code) 
 extension are loaded directly by the LVM without the RAM overhead of compilation.
 Each `luac.cross` execution produces a single output file containing the bytecodes
 for all source files given. By default the output file is `luac.out`, but you would normally
 change this with the `-o` option. If you wish you can mix Lua source files (and
 even Lua binary files) on the command line. You can use '-' to indicate the
 standard input as a source file and '--' to signal the end of options (that is, all
 remaining arguments will be treated as files even if they start with '-').
 `luac.cross` supports the standard `luac` options `-l`, `-o`, `-p`, `-s` and `-v`, 
 as well as the `-h` option which produces the current help overview.
 NodeMCU also implements some major extensions to support the use of the 
 [Lua Flash Store (LFS)](lfs.md), in that it can produce an LFS image file which 
 is loaded as an overlay into the firmware in flash memory; the LVM can access and 
 execute this code directly from flash without needing to store code in RAM.  This
 mode is enabled by specifying the `-f` option.
 `luac.cross` supports two separate image formats:
 -  **Compact relocatable**. This is selected by the `-f` option. Here the compiler compresses the compiled binary so that the image is small for downloading over Wifi/WAN (e.g. a full 64Kb LFS image is compressed down to a 22Kb file). The LVM processes such an image in two passes, with the integrity of the image validated on the first, and the LFS itself updated on the second.  The LVM also checks that the image will fit in the allocated LFS region before loading, but you can also use the `-m` option to throw a compile error if the image is too large; for example, `-m 0x10000` will raise an error if the image will not load into a 64Kb region.
 -  **Absolute**. This is selected by the `-a <baseAddr>` option. Here the compiler fixes all addresses relative to the base address specified. This allows an LFS absolute image to be loaded directly into the ESP flash using a tool such as  `esptool.py`. 
 These two modes target two separate use cases: the compact relocatable format 
 facilitates simple OTA updates to an LFS based Lua application; the absolute format
 facilitates factory installation of LFS based applications.
 Also note that the `app/lua/luac_cross` make and Makefile can be executed to build
 just the `luac.cross` image.  You must first ensure that the following options in
 `app/include/user_config.h` are matched to your target configuration:
 ```c
 //#define LUA_NUMBER_INTEGRAL       // uncomment if you want an integer build
 //#define LUA_FLASH_STORE 0x10000   // uncomment if you want LFS support
 ```
 Developers have successfully built this on Linux (including docker builds), MacOS, Win10/WSL and WinX/Cygwin. 
--- a/docs/en/modules/node.md
+++ b/docs/en/modules/node.md
@ -199,11 +199,10 @@ Reload the [LFS (Lua Flash Store)](../lfs.md) with the flash image provided. Fla
 `node.flashreload(imageName)`
 #### Parameters
-`imageName` The of name of a image file in the filesystem to be loaded into the LFS.
+`imageName` The name of an image file in the filesystem to be loaded into the LFS.
 #### Returns
-If the LFS image has the incorrect signature or size, then `false` is returned.
+`Error message`  LFS images are now gzip compressed.  In the case of the `imagename` being a valid LFS image, this is expanded and loaded into flash.  The ESP is then immediately rebooted, _so control is not returned to the calling Lua application_ in the case of a successful reload.  This reload process internally makes two passes through the LFS image file; and on the first it validates the file and header formats and detects any errors.  If any is detected then an error string is returned.
 In the case of the `imagename` being a valid LFS image, this is then loaded into flash.  The ESP is then immediately rebooted so control is not returned to the calling application. 
 ## node.flashsize()
--- a/lua_examples/lfs/_init.lua
+++ b/lua_examples/lfs/_init.lua
@ -78,3 +78,24 @@ end
 -- disable Lua 5.0 style modules to save RAM (package.seeall is cleared too)
 G.module, package.seeall = nil, nil
 --[[-------------------------------------------------------------------------------
  These replace the builtin loadfile & dofile with versions which preferentially
  load the corresponding module from LFS if present.  Flipping the search order
  is an exercise left to the reader.
 ---------------------------------------------------------------------------------]]
 -- Keep handles on the original builtins; the wrappers fall back to them.
 local lf, df = loadfile, dofile
 -- loadfile replacement: for a name ending in ".lua" or ".lc", look the base
 -- name up with index() and return what it found; otherwise defer to the
 -- builtin loadfile.
 -- @param n  file name (string)
 -- @return   the first result of index(mod), or whatever the builtin returns
 G.loadfile = function(n)
   -- Anchor the pattern at both ends so that only a genuine trailing
   -- extension counts: "foo.lua.bak" or "foo.lcx" must not resolve to the
   -- module "foo".
   local mod, ext = n:match("^(.*)%.(l[uc]a?)$")
   -- Check the extension before the lookup so index() never receives nil.
   if ext ~= 'lc' and ext ~= 'lua' then return lf(n) end
   -- A second result from index() is treated as "use the file instead"
   -- (index is defined outside this hunk -- confirm its contract there).
   local fn, ba = index(mod)
   if ba then return lf(n) end
   return fn
 end
 -- dofile replacement: for a name ending in ".lua" or ".lc", look the base
 -- name up with index() and call what it found; otherwise defer to the
 -- builtin dofile captured in df.
 -- @param n  file name (string)
 -- @return   the results of calling the function found, or of the builtin
 G.dofile = function(n)
   -- Anchor the pattern at both ends so that only a genuine trailing
   -- extension counts: "foo.lua.bak" or "foo.lcx" must not resolve to the
   -- module "foo".
   local mod, ext = n:match("^(.*)%.(l[uc]a?)$")
   -- Check the extension before the lookup so index() never receives nil.
   if ext ~= 'lc' and ext ~= 'lua' then return df(n) end
   -- A second result from index() is treated as "use the file instead"
   -- (index is defined outside this hunk -- confirm its contract there).
   local fn, ba = index(mod)
   if ba then return df(n) end
   return fn()
 end
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -26,6 +26,7 @@ pages:
    - Building the firmware: 'en/build.md'
    - Flashing the firmware: 'en/flash.md'
    - Uploading code: 'en/upload.md'
    - Compiling code: 'en/compiling.md'
    - Support: 'en/support.md'
 - FAQs:
    - Lua Developer FAQ: 'en/lua-developer-faq.md'