Fix net module data loss & RTOS task unsafety (#2829)

To avoid races between the lwIP callbacks (lwIP RTOS task) and the Lua handlers (LVM RTOS task), the data flow and ownership has been simplified and cleaned up. lwIP callbacks now have no visibility of the userdata struct. They are limited to creating small event objects and task_post()ing them over to the LVM "thread", passing ownership in doing so. The shared identifier then becomes the struct netconn*. On the LVM side, we keep a linked list of active userdata objects. This allows us to retrieve the correct userdata when we get an event with a netconn pointer. Because this list is only ever used within the LVM task, no locking is necessary. The old approach of stashing a userdata pointer into the 'socket' field on the netconn has been removed entirely, as this was both not thread/RTOS-task safe, and also interfered with the IDFs internal use of the socket field (even when using only the netconn layer). As an added benefit, this removed the need for all the SYS_ARCH_PROTECT() locking stuff. The need to track receive events before the corresponding userdata object has been established has been removed by virtue of not reordering the "accept" and the "recv" events any more (previously accepts were posted with medium priority, while the receives where high priority, leading to the observed reordering and associated headaches). The workaround for IDF issue 784 has been removed as it is now not needed and is in fact directly harmful as it results in a double-free. Yay for getting rid of old workarounds! DNS resolution code paths were merged for the two instances of "socket" initiated resolves (connect/dns functions). Also fixed an instance of using a stack variable for receiving the resolved IP address, with said variable going out of scope before the DNS resolution necessarily completed (hello, memory corruption!). Where possible, moved to use the Lua allocator rather than plain malloc. Finally, the NodeMCU task posting mechanism got a polish and an adjustment. Given all the Bad(tm) that tends to happen if something fails task posting, I went through a couple of iterations on how to avoid that. Alas, the preferred solution of blocking non-LVM RTOS tasks until a slot is free turned out to not be viable, as this easily resulted in deadlocks with the lwIP stack. After much deliberation I settled on increasing the number of available queue slots for the task_post() mechanism, but in the interest of user control also now made it user configurable via Kconfig.
2019-07-15 07:20:20 +10:00 · 2019-07-15 07:20:20 +10:00 · f44e6e9639
parent c0145e03f9
commit f44e6e9639
5 changed files with 383 additions and 410 deletions
--- a/components/base_nodemcu/user_main.c
+++ b/components/base_nodemcu/user_main.c
@ -146,6 +146,8 @@ void nodemcu_init(void)

 void app_main (void)
 {
+  task_init();
+
  esp_event_queue =
    xQueueCreate (CONFIG_SYSTEM_EVENT_QUEUE_SIZE, sizeof (system_event_t));
  esp_event_task = task_get_id (handle_esp_event);
--- a/components/modules/net.c
+++ b/components/modules/net.c
--- a/components/task/Kconfig
+++ b/components/task/Kconfig
@ -0,0 +1,22 @@
+menu "NodeMCU task slot configuration"
+
+config NODEMCU_TASK_SLOT_MEMORY
+  int "Task slot buffer size"
+  default 2000
+  range 80 16000
+  help
+      NodeMCU uses a fixed size RTOS queue for messaging between internal
+      LVM tasks as well as from other RTOS tasks. If this queue is too
+      small, events and data will go missing. On the other hand, if the
+      queue is too big, some memory will go unused.
+
+      The default value is chosen to be on the safe side for most use
+      cases. Lowering this value will yield more available RAM for use
+      in Lua, but at the increased risk of data loss. Conversely,
+      increasing this value can help resolve aforementioned data loss
+      issues, if encountered.
+
+      The assigned memory size here gets partitioned to the different
+      task priorities; some rounding down may take place as a result.
+
+endmenu
--- a/components/task/include/task/task.h
+++ b/components/task/include/task/task.h
@ -29,9 +29,11 @@ bool task_post(task_prio_t priority, task_handle_t handle, task_param_t param);

 typedef void (*task_callback_t)(task_param_t param, task_prio_t prio);

-bool task_init_handler(task_prio_t priority, uint8 qlen);
 task_handle_t task_get_id(task_callback_t t);

+/* Init, must be called before any posting happens */
+void task_init (void);
+
 /* RTOS loop to pump task messages until infinity */
 void task_pump_messages (void);

--- a/components/task/task.c
+++ b/components/task/task.c
@ -13,7 +13,6 @@
 #define TASK_HANDLE_MASK    0xFFF80000
 #define TASK_HANDLE_UNMASK  (~TASK_HANDLE_MASK)
 #define TASK_HANDLE_ALLOCATION_BRICK 4   // must be a power of 2
-#define TASK_DEFAULT_QUEUE_LEN 8

 #define CHECK(p,v,msg) if (!(p)) { NODE_DBG ( msg ); return (v); }

@ -40,33 +39,7 @@ static task_callback_t *task_func;
 static int task_count;


-/*
- * Initialise the task handle callback for a given priority.  This doesn't need
- * to be called explicitly as the get_id function will call this lazily.
- */
-bool task_init_handler(task_prio_t priority, uint8 qlen) {
-  if (priority >= TASK_PRIORITY_COUNT)
-    return false;
-
-  if (task_Q[priority] == NULL)
-  {
-    task_Q[priority] = xQueueCreate (qlen, sizeof (task_event_t));
-    return task_Q[priority] != NULL;
-  }
-  else
-    return false;
-}
-
-
 task_handle_t task_get_id(task_callback_t t) {
-  /* Initialise any uninitialised Qs with the default Q len */
-  for (task_prio_t p = TASK_PRIORITY_LOW; p != TASK_PRIORITY_COUNT; ++p)
-  {
-    if (!task_Q[p]) {
-      CHECK(task_init_handler( p, TASK_DEFAULT_QUEUE_LEN ), 0, "Task initialisation failed");
-    }
-  }
-
  if ( (task_count & (TASK_HANDLE_ALLOCATION_BRICK - 1)) == 0 ) {
    /* With a brick size of 4 this branch is taken at 0, 4, 8 ... and the new size is +4 */
    task_func =(task_callback_t *)realloc(
@ -85,15 +58,13 @@ task_handle_t task_get_id(task_callback_t t) {
 bool task_post (task_prio_t priority, task_handle_t handle, task_param_t param)
 {
  if (priority >= TASK_PRIORITY_COUNT ||
-      !task_Q[priority] ||
      (handle & TASK_HANDLE_MASK) != TASK_HANDLE_MONIKER)
    return false;

  task_event_t ev = { handle, param };
  bool res = pdPASS == xQueueSendToBackFromISR (task_Q[priority], &ev, NULL);

-  if (pending) /* only need to raise semaphore if it's been initialised */
-    xSemaphoreGiveFromISR (pending, NULL);
+  xSemaphoreGiveFromISR (pending, NULL);

  return res;
 }
@ -129,9 +100,30 @@ static void dispatch (task_event_t *e, uint8_t prio) {
 }


+void task_init (void)
+{
+  pending = xSemaphoreCreateBinary ();
+
+  // Due to the nature of the RTOS, if we ever fail to do a task_post, we're
+  // up the proverbial creek without a paddle. Each queue slot only costs us
+  // 8 bytes, so it's a worthwhile trade-off to reserve a bit of RAM for this
+  // purpose. Trying to work around the issue in the places which post ends
+  // up being *at least* as bad, so better take the hit where the benefit
+  // is shared. That said, we let the user have the final say.
+  const size_t q_mem = CONFIG_NODEMCU_TASK_SLOT_MEMORY;
+
+  // Rather than blindly sizing everything the same, we try to be a little
+  // bit aware of the typical uses of the queues.
+  const size_t slice = q_mem / (10 * sizeof(task_event_t));
+  static const int qlens[] = { 1 * slice, 5 * slice, 4 * slice };
+
+  for (task_prio_t p = TASK_PRIORITY_LOW; p != TASK_PRIORITY_COUNT; ++p)
+    task_Q[p] = xQueueCreate (qlens[p], sizeof (task_event_t));
+}
+
+
 void task_pump_messages (void)
 {
-  vSemaphoreCreateBinary (pending);
  for (;;)
  {
    task_event_t ev;