From ef91580c7b03df2afb7c9be19a95dac41e63c82a Mon Sep 17 00:00:00 2001
From: Philip Gladstone
Date: Sun, 3 Dec 2017 07:10:59 -0500
Subject: [PATCH] Addition of a Bloom Filter object (#2176)

* Initial checkin
* Add bloom.md into mkdocs
* Added reset and improved info
* Update bloom.c
* Update bloom.md
* Add Wikipedia link
---
 app/include/user_modules.h |   1 +
 app/modules/bloom.c        | 221 +++++++++++++++++++++++++++++++++++++
 docs/en/modules/bloom.md   | 103 +++++++++++++++++
 mkdocs.yml                 |   1 +
 4 files changed, 326 insertions(+)
 create mode 100644 app/modules/bloom.c
 create mode 100644 docs/en/modules/bloom.md

diff --git a/app/include/user_modules.h b/app/include/user_modules.h
index 456cb5cf..ada4854c 100644
--- a/app/include/user_modules.h
+++ b/app/include/user_modules.h
@@ -23,6 +23,7 @@
 //#define LUA_USE_MODULES_AM2320
 //#define LUA_USE_MODULES_APA102
 #define LUA_USE_MODULES_BIT
+//#define LUA_USE_MODULES_BLOOM
 //#define LUA_USE_MODULES_BMP085
 //#define LUA_USE_MODULES_BME280
 //#define LUA_USE_MODULES_BME680
diff --git a/app/modules/bloom.c b/app/modules/bloom.c
new file mode 100644
index 00000000..83e801b6
--- /dev/null
+++ b/app/modules/bloom.c
@@ -0,0 +1,221 @@
+/*
+ * Module for bloom filters
+ *
+ * Philip Gladstone, N1DQ
+ */
+
+#include "module.h"
+#include "lauxlib.h"
+#include "c_types.h"
+#include "../crypto/sha2.h"
+
+#if defined(LUA_USE_MODULES_BLOOM) && !defined(SHA2_ENABLE)
+#error Must have SHA2_ENABLE set for BLOOM module
+#endif
+
+// Filter state; lives in a Lua userdata with the bit array appended to it
+typedef struct {
+  uint8 fns;        // number of bits tested or set per string
+  uint16 size;      // length of buf in 32-bit words
+  uint32 occupancy; // number of bits currently set in buf
+  uint32 buf[];     // the bit array
+} bloom_t;
+
+// Hashes the string and sets (add) or tests (!add) one bit per hash function.
+// Returns true only when every selected bit was already set, i.e. the string
+// was (probably) already in the filter.
+static bool add_or_check(const uint8 *buf, size_t len, bloom_t *filter, bool add) {
+  SHA256_CTX ctx;
+  SHA256_Init(&ctx);
+  SHA256_Update(&ctx, buf, len);
+
+  // Unsigned bytes, so the digest never sign-extends in the arithmetic below
+  uint8 hash[32];
+  SHA256_Final(hash, &ctx);
+
+  int i;
+  uint32 bits = (uint32) filter->size << 5;
+  const uint8 *h = hash;
+  bool prev = true;
+  // Each function reads a 3-byte window of the digest. With more than 10
+  // functions the windows overlap by one byte so they all fit in 32 bytes.
+  int hstep = filter->fns > 10 ? 2 : 3;
+  for (i = 0; i < filter->fns; i++) {
+    uint32 val = ((((uint32) h[0] << 8) + h[1]) << 8) + h[2];
+    h += hstep;
+    val = val % bits;
+
+    uint32 offset = val >> 5;
+    // Unsigned constant: shifting a signed 1 into bit 31 is undefined
+    uint32 bit = (uint32) 1 << (val & 31);
+
+    if (!(filter->buf[offset] & bit)) {
+      prev = false;
+      if (add) {
+        filter->buf[offset] |= bit;
+        filter->occupancy++;
+      } else {
+        // A single clear bit proves the string was never added
+        break;
+      }
+    }
+  }
+
+  return prev;
+}
+
+// filter:check(string) -> true if the string may have been added
+static int bloom_filter_check(lua_State *L) {
+  bloom_t *filter = (bloom_t *)luaL_checkudata(L, 1, "bloom.filter");
+  size_t length;
+  const uint8 *buffer = (const uint8 *) luaL_checklstring(L, 2, &length);
+
+  bool rc = add_or_check(buffer, length, filter, false);
+
+  lua_pushboolean(L, rc);
+  return 1;
+}
+
+// filter:add(string) -> true if the string already appeared to be present
+static int bloom_filter_add(lua_State *L) {
+  bloom_t *filter = (bloom_t *)luaL_checkudata(L, 1, "bloom.filter");
+  size_t length;
+  const uint8 *buffer = (const uint8 *) luaL_checklstring(L, 2, &length);
+
+  bool rc = add_or_check(buffer, length, filter, true);
+
+  lua_pushboolean(L, rc);
+  return 1;
+}
+
+// filter:reset() -- clears every bit and the occupancy count
+static int bloom_filter_reset(lua_State *L) {
+  bloom_t *filter = (bloom_t *)luaL_checkudata(L, 1, "bloom.filter");
+
+  memset(filter->buf, 0, filter->size << 2);
+  filter->occupancy = 0;
+
+  return 0;
+}
+
+// filter:info() -> bits, fns, occupancy, fprate
+// fprate is the n of a 1-in-n false positive estimate, capped at 1000000.
+static int bloom_filter_info(lua_State *L) {
+  bloom_t *filter = (bloom_t *)luaL_checkudata(L, 1, "bloom.filter");
+
+  lua_pushinteger(L, filter->size << 5);
+  lua_pushinteger(L, filter->fns);
+  lua_pushinteger(L, filter->occupancy);
+
+  // Now calculate the chance that a FP will be returned
+  uint64 prob = 1000000;
+  if (filter->occupancy > 0) {
+    unsigned int ratio = (filter->size << 5) / filter->occupancy;
+    int i;
+
+    prob = ratio;
+
+    for (i = 1; i < filter->fns && prob < 1000000; i++) {
+      prob = prob * ratio;
+    }
+
+    if (prob < 1000000) {
+      // try again with some scaling
+      unsigned int ratio256 = (filter->size << 13) / filter->occupancy;
+
+      uint64 prob256 = ratio256;
+
+      for (i = 1; i < filter->fns && prob256 < 256000000; i++) {
+        prob256 = (prob256 * ratio256) >> 8;
+      }
+
+      prob = prob256 >> 8;
+    }
+  }
+
+  lua_pushinteger(L, prob > 1000000 ? 1000000 : (int) prob);
+
+  return 4;
+}
+
+// bloom.create(elements, errorrate) -> filter
+// Sizes the bit array for `elements` strings at a false positive rate of
+// 1 in `errorrate` and returns an empty filter userdata.
+static int bloom_create(lua_State *L) {
+  int items = luaL_checkinteger(L, 1);
+  int error = luaL_checkinteger(L, 2);
+
+  // items is a divisor further down, so zero must be rejected up front
+  luaL_argcheck(L, items > 0, 1, "must be at least 1");
+
+  // -logp is the number of significant bits in error, roughly log2(error)
+  int n = error;
+  int logp = 0;
+  while (n > 0) {
+    n = n >> 1;
+    logp--;
+  }
+
+  // 1.5 * items * -logp bits, computed in 64 bits so it cannot overflow
+  uint64 wanted = (uint64) items * (uint64) (-logp);
+  wanted += wanted >> 1;
+
+  // Round up to a whole number of 32-bit words
+  wanted = (wanted + 31) & ~(uint64) 31;
+
+  if (wanted < 256) {
+    wanted = 256;
+  }
+  // size is a uint16 count of words: refuse instead of silently truncating
+  if (wanted > ((uint64) 0xFFFF << 5)) {
+    return luaL_error(L, "filter too large");
+  }
+
+  int bits = (int) wanted;
+  int size = bits >> 3;
+
+  // About two thirds of bits / items hash functions, clamped to 2..15
+  int fns = bits / items;
+  fns = (fns >> 1) + fns / 6;
+
+  if (fns < 2) {
+    fns = 2;
+  }
+  if (fns > 15) {
+    fns = 15;
+  }
+
+  bloom_t *filter = (bloom_t *) lua_newuserdata(L, sizeof(bloom_t) + size);
+  //
+  // Associate its metatable
+  luaL_getmetatable(L, "bloom.filter");
+  lua_setmetatable(L, -2);
+
+  memset(filter, 0, sizeof(bloom_t) + size);
+  filter->size = size >> 2;
+  filter->fns = fns;
+
+  return 1;
+}
+
+static const LUA_REG_TYPE bloom_filter_map[] = {
+  { LSTRKEY( "add" ), LFUNCVAL( bloom_filter_add ) },
+  { LSTRKEY( "check" ), LFUNCVAL( bloom_filter_check ) },
+  { LSTRKEY( "reset" ), LFUNCVAL( bloom_filter_reset ) },
+  { LSTRKEY( "info" ), LFUNCVAL( bloom_filter_info ) },
+  { LSTRKEY( "__index" ), LROVAL( bloom_filter_map ) },
+  { LNILKEY, LNILVAL }
+};
+
+// Module function map
+static const LUA_REG_TYPE bloom_map[] = {
+  { LSTRKEY( "create" ), LFUNCVAL( bloom_create ) },
+  { LNILKEY, LNILVAL }
+};
+
+LUALIB_API int bloom_open(lua_State *L) {
+  luaL_rometatable(L, "bloom.filter", (void *)bloom_filter_map);
+  return 1;
+}
+
+NODEMCU_MODULE(BLOOM, "bloom", bloom_map, bloom_open);
diff --git a/docs/en/modules/bloom.md b/docs/en/modules/bloom.md
new file mode 100644
index 00000000..6eca5a5d
--- /dev/null
+++ b/docs/en/modules/bloom.md
@@ -0,0 +1,103 @@
+# Bloom Module
+| Since | Origin / Contributor | Maintainer | Source |
+| :----- | :-------------------- | :---------- | :------ |
+| 2017-11-13 | [Philip Gladstone](https://github.com/pjsg) | [Philip Gladstone](https://github.com/pjsg) | [bloom.c](../../../app/modules/bloom.c)|
+
+
+This module implements a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter). This is a probabilistic data structure that is used to test for set membership. There are two operations -- `add` and `check` -- that allow
+arbitrary strings to be added to the set or tested for set membership. Since this is a probabilistic data structure, the answer returned can be incorrect. However,
+if the string *is* a member of the set, then the `check` operation will always return `true`.
+
+## bloom.create()
+Create a filter object.
+
+#### Syntax
+`bloom.create(elements, errorrate)`
+
+#### Parameters
+- `elements` The largest number of elements to be added to the filter. Must be at least 1.
+- `errorrate` The error rate (the false positive rate). This is represented as `n` where the false positive rate is `1 / n`. This is the maximum rate of `check` returning true when the string is *not* in the set.
+
+#### Returns
+A `filter` object.
+
+#### Example
+
+```
+ filter = bloom.create(10000, 100) -- this will use around 13kB of memory
+```
+
+## filter:add()
+Adds a string to the set and returns an indication of whether the string was already present.
+
+#### Syntax
+`filter:add(string)`
+
+#### Parameters
+- `string` The string to be added to the filter set.
+
+#### Returns
+`true` if the string was already present in the filter. `false` otherwise.
+
+#### Example
+
+```
+ if filter:add("apple") then
+   print ("Seen an apple before!")
+ else
+   print ("Noted that the first apple has been seen")
+ end
+```
+
+## filter:check()
+Checks to see if a string is present in the filter set.
+
+#### Syntax
+`present = filter:check(string)`
+
+#### Parameters
+- `string` The string to be checked for membership in the set.
+
+#### Returns
+`true` if the string was already present in the filter. `false` otherwise.
+
+#### Example
+
+```
+ if filter:check("apple") then
+   print ("Seen an apple before!")
+ end
+```
+
+
+## filter:reset()
+Empties the filter.
+
+#### Syntax
+`filter:reset()`
+
+#### Returns
+Nothing
+
+#### Example
+```
+filter:reset()
+```
+
+## filter:info()
+Get some status information on the filter.
+
+#### Syntax
+`bits, fns, occupancy, fprate = filter:info()`
+
+#### Returns
+- `bits` The number of bits in the filter.
+- `fns` The number of hash functions in use.
+- `occupancy` The number of bits set in the filter.
+- `fprate` The approximate chance that the next `check` will return `true` when it should return `false`. This is represented as the inverse of the probability -- i.e. as the n in 1-in-n chance. This value is limited to 1,000,000.
+
+#### Example
+```
+bits, fns, occupancy, fprate = filter:info()
+```
+
diff --git a/mkdocs.yml b/mkdocs.yml
index 4b62c304..b6a68f69 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -41,6 +41,7 @@ pages:
     - 'am2320': 'en/modules/am2320.md'
     - 'apa102': 'en/modules/apa102.md'
     - 'bit': 'en/modules/bit.md'
+    - 'bloom' : 'en/modules/bloom.md'
     - 'bme280': 'en/modules/bme280.md'
     - 'bmp085': 'en/modules/bmp085.md'
     - 'cjson': 'en/modules/cjson.md'