Addition of a Bloom Filter object (#2176)

* Initial checkin
* Add bloom.md into mkdocs
* Added reset and improved info
* Update bloom.c
* Update bloom.md
* Add Wikipedia link
This commit is contained in:
Philip Gladstone 2017-12-03 07:10:59 -05:00 committed by Marcel Stör
parent 87a6a9bdb9
commit ef91580c7b
4 changed files with 297 additions and 0 deletions

View File

@ -23,6 +23,7 @@
//#define LUA_USE_MODULES_AM2320 //#define LUA_USE_MODULES_AM2320
//#define LUA_USE_MODULES_APA102 //#define LUA_USE_MODULES_APA102
#define LUA_USE_MODULES_BIT #define LUA_USE_MODULES_BIT
//#define LUA_USE_MODULES_BLOOM
//#define LUA_USE_MODULES_BMP085 //#define LUA_USE_MODULES_BMP085
//#define LUA_USE_MODULES_BME280 //#define LUA_USE_MODULES_BME280
//#define LUA_USE_MODULES_BME680 //#define LUA_USE_MODULES_BME680

192
app/modules/bloom.c Normal file
View File

@ -0,0 +1,192 @@
/*
* Module for bloom filters
*
* Philip Gladstone, N1DQ
*/
#include "module.h"
#include "lauxlib.h"
#include "c_types.h"
#include "../crypto/sha2.h"
#if defined(LUA_USE_MODULES_BLOOM) && !defined(SHA2_ENABLE)
#error Must have SHA2_ENABLE set for BLOOM module
#endif
/*
 * State of one bloom filter. bloom_create allocates it as a single Lua
 * userdata with the bit array stored inline after the header fields.
 */
typedef struct {
// Number of bit positions probed per string (one per "hash function").
uint8 fns;
// Length of buf in 32-bit words; the filter holds size << 5 bits.
uint16 size;
// Count of bits currently set in buf (incremented whenever add sets a bit).
uint32 occupancy;
// The bit array itself, as a flexible array member of `size` words.
uint32 buf[];
} bloom_t;
/*
 * Hash a string and either test or set the filter bits it maps to.
 *
 * The SHA-256 digest of the input is sliced into filter->fns 24-bit values
 * (possibly overlapping, see hstep below). Each value, reduced modulo the
 * number of bits in the filter, selects one bit.
 *
 * buf, len  the bytes to hash
 * filter    the filter to probe; only modified when add is true
 * add       true:  set every selected bit that is clear and count it in
 *                  filter->occupancy
 *           false: stop at the first clear bit without changing anything
 *
 * Returns true when every selected bit was already set (the string may have
 * been added before), false when at least one selected bit was clear.
 */
static bool add_or_check(const uint8 *buf, size_t len, bloom_t *filter, bool add) {
  SHA256_CTX ctx;
  // Unsigned bytes: the digest is consumed as raw octets below.
  uint8 hash[32];

  SHA256_Init(&ctx);
  SHA256_Update(&ctx, buf, len);
  SHA256_Final(hash, &ctx);

  uint32 bits = (uint32) filter->size << 5;
  const uint8 *h = hash;
  bool prev = true;

  // Every probe reads three digest bytes. With more than 10 probes a stride
  // of 3 would run past the 32-byte digest, so the stride drops to 2 and
  // consecutive windows share one byte.
  int hstep = filter->fns > 10 ? 2 : 3;

  int i;
  for (i = 0; i < filter->fns; i++) {
    uint32 val = ((uint32) h[0] << 16) | ((uint32) h[1] << 8) | (uint32) h[2];
    h += hstep;

    val = val % bits;

    uint32 offset = val >> 5;
    // Shift an unsigned one: a signed `1 << 31` is undefined behaviour.
    uint32 bit = (uint32) 1 << (val & 31);

    if (!(filter->buf[offset] & bit)) {
      prev = false;
      if (!add) {
        // A single clear bit proves the string was never added.
        break;
      }
      filter->buf[offset] |= bit;
      filter->occupancy++;
    }
  }

  return prev;
}
// Lua: present = filter:check(string)
// Pushes the result of add_or_check in test-only mode: true when every bit
// the string maps to is already set, false otherwise. The filter is not
// modified.
static int bloom_filter_check(lua_State *L) {
  bloom_t *self = (bloom_t *)luaL_checkudata(L, 1, "bloom.filter");
  size_t nbytes;
  const uint8 *str = (uint8 *) luaL_checklstring(L, 2, &nbytes);

  lua_pushboolean(L, add_or_check(str, nbytes, self, false));
  return 1;
}
// Lua: seen = filter:add(string)
// Sets the bits the string maps to and pushes whether all of them were
// already set beforehand (true means the string looked present already).
static int bloom_filter_add(lua_State *L) {
  bloom_t *self = (bloom_t *)luaL_checkudata(L, 1, "bloom.filter");
  size_t nbytes;
  const uint8 *str = (uint8 *) luaL_checklstring(L, 2, &nbytes);

  lua_pushboolean(L, add_or_check(str, nbytes, self, true));
  return 1;
}
// Lua: filter:reset()
// Clears every bit in the filter and zeroes the occupancy counter; the
// size and number of hash functions are left as they were. Returns nothing.
static int bloom_filter_reset(lua_State *L) {
  bloom_t *self = (bloom_t *)luaL_checkudata(L, 1, "bloom.filter");

  self->occupancy = 0;
  // size counts 32-bit words, so the byte length is size * 4.
  memset(self->buf, 0, self->size << 2);

  return 0;
}
// Lua: bits, fns, occupancy, fprate = filter:info()
// Pushes the number of bits in the filter, the number of hash functions,
// the number of bits currently set, and an estimate of the false positive
// rate expressed as n in "1 in n", capped at 1,000,000.
static int bloom_filter_info(lua_State *L) {
  bloom_t *self = (bloom_t *)luaL_checkudata(L, 1, "bloom.filter");

  lua_pushinteger(L, self->size << 5);
  lua_pushinteger(L, self->fns);
  lua_pushinteger(L, self->occupancy);

  // The estimate is (bits / occupancy) ^ fns. An empty filter reports the cap.
  uint64 inverse = 1000000;

  if (self->occupancy > 0) {
    // First attempt: whole-number ratio, stopping once the cap is reached.
    unsigned int whole = (self->size << 5) / self->occupancy;
    int k = 1;

    inverse = whole;
    while (k < self->fns && inverse < 1000000) {
      inverse = inverse * whole;
      k++;
    }

    if (inverse < 1000000) {
      // The truncated ratio stayed under the cap, so repeat the calculation
      // with 8 fractional bits (values scaled by 256) for better precision.
      unsigned int scaled = (self->size << 13) / self->occupancy;
      uint64 acc = scaled;

      k = 1;
      while (k < self->fns && acc < 256000000) {
        acc = (acc * scaled) >> 8;
        k++;
      }

      inverse = acc >> 8;
    }
  }

  if (inverse > 1000000) {
    lua_pushinteger(L, 1000000);
  } else {
    lua_pushinteger(L, (int) inverse);
  }

  return 4;
}
// Lua: filter = bloom.create(elements, errorrate)
//
// elements   the largest number of strings expected to be added (must be > 0)
// errorrate  n, where the target false positive rate is 1 / n (must be > 0)
//
// Sizes the bit array from the two arguments, picks the number of hash
// functions, and returns a zeroed filter userdata with the "bloom.filter"
// metatable attached.
//
// Raises a Lua error when either argument is not positive, or when the
// resulting filter would need more 32-bit words than the 16-bit size field
// of bloom_t can describe.
static int bloom_create(lua_State *L) {
  int items = luaL_checkinteger(L, 1);
  int error = luaL_checkinteger(L, 2);

  // items is used as a divisor below; zero or negative values are meaningless.
  luaL_argcheck(L, items > 0, 1, "must be greater than zero");
  luaL_argcheck(L, error > 0, 2, "must be greater than zero");

  // log2e = floor(log2(error)) + 1, i.e. bits per item before the 1.5x margin.
  int log2e = 0;
  int n;
  for (n = error; n > 0; n >>= 1) {
    log2e++;
  }

  // Work in 64 bits so that items * log2e cannot overflow.
  uint64 want = (uint64) items * (uint64) log2e;
  want += want >> 1;

  // Round up to a whole number of 32-bit words, with a floor of 256 bits.
  want = (want + 31) & ~(uint64) 31;
  if (want < 256) {
    want = 256;
  }

  // bloom_t.size is a uint16 count of 32-bit words; anything larger would be
  // silently truncated (possibly to zero) when stored.
  if (want > ((uint64) 0xFFFF << 5)) {
    return luaL_error(L, "bloom filter too large");
  }

  int bits = (int) want;
  int size = bits >> 3;   // bytes of bit array

  // Roughly 2/3 of the bits-per-item figure, clamped to 2..15. The upper
  // bound keeps the probes within the 32-byte digest used by add_or_check.
  int fns = bits / items;
  fns = (fns >> 1) + fns / 6;
  if (fns < 2) {
    fns = 2;
  }
  if (fns > 15) {
    fns = 15;
  }

  bloom_t *filter = (bloom_t *) lua_newuserdata(L, sizeof(bloom_t) + size);

  // Associate its metatable
  luaL_getmetatable(L, "bloom.filter");
  lua_setmetatable(L, -2);

  // Clears the bit array and the occupancy counter in one go.
  memset(filter, 0, sizeof(bloom_t) + size);
  filter->size = size >> 2;
  filter->fns = fns;

  return 1;
}
// Method table for filter objects; bloom_open registers it under the name
// "bloom.filter". The "__index" entry refers back to this same table, which
// is presumably what lets filter:add(), filter:check() etc. resolve on the
// userdata. The table ends with the LNILKEY/LNILVAL sentinel entry.
static const LUA_REG_TYPE bloom_filter_map[] = {
{ LSTRKEY( "add" ), LFUNCVAL( bloom_filter_add ) },
{ LSTRKEY( "check" ), LFUNCVAL( bloom_filter_check ) },
{ LSTRKEY( "reset" ), LFUNCVAL( bloom_filter_reset ) },
{ LSTRKEY( "info" ), LFUNCVAL( bloom_filter_info ) },
{ LSTRKEY( "__index" ), LROVAL( bloom_filter_map ) },
{ LNILKEY, LNILVAL }
};
// Module function map: the functions exposed on the `bloom` module table.
// Only `create` lives here; everything else is a method on the filter object.
static const LUA_REG_TYPE bloom_map[] = {
{ LSTRKEY( "create" ), LFUNCVAL( bloom_create ) },
{ LNILKEY, LNILVAL }
};
// Open function passed to NODEMCU_MODULE below. Registers bloom_filter_map
// under the metatable name "bloom.filter", the same name bloom_create and
// the filter methods use with luaL_getmetatable / luaL_checkudata.
// NOTE(review): returns 1 -- confirm this matches what luaL_rometatable
// leaves on the Lua stack and what the module loader expects.
LUALIB_API int bloom_open(lua_State *L) {
luaL_rometatable(L, "bloom.filter", (void *)bloom_filter_map);
return 1;
}
// Registers the module: config name BLOOM, Lua name "bloom", function table
// bloom_map, and bloom_open as its open function.
// NOTE(review): presumably gated by LUA_USE_MODULES_BLOOM -- confirm in module.h.
NODEMCU_MODULE(BLOOM, "bloom", bloom_map, bloom_open);

103
docs/en/modules/bloom.md Normal file
View File

@ -0,0 +1,103 @@
# Bloom Module
| Since | Origin / Contributor | Maintainer | Source |
| :----- | :-------------------- | :---------- | :------ |
| 2017-11-13 | [Philip Gladstone](https://github.com/pjsg) | [Philip Gladstone](https://github.com/pjsg) | [bloom.c](../../../app/modules/bloom.c)|
This module implements a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter). This is a probabilistic data structure that is used to test for set membership. There are two operations -- `add` and `check` -- that allow
arbitrary strings to be added to the set or tested for set membership. Since this is a probabilistic data structure, the answer returned can be incorrect. However,
if the string *is* a member of the set, then the `check` operation will always return `true`.
## bloom.create()
Create a filter object.
#### Syntax
`bloom.create(elements, errorrate)`
#### Parameters
- `elements` The largest number of elements to be added to the filter.
- `errorrate` The error rate (the false positive rate). This is represented as `n` where the false positive rate is `1 / n`. This is the maximum rate of `check` returning true when the string is *not* in the set.
#### Returns
A `filter` object.
#### Example
```
filter = bloom.create(10000, 100) -- this will use around 13kB of memory
```
## filter:add()
Adds a string to the set and returns an indication of whether the string was already present.
#### Syntax
`filter:add(string)`
#### Parameters
- `string` The string to be added to the filter set.
#### Returns
`true` if the string was already present in the filter. `false` otherwise.
#### Example
```
if filter:add("apple") then
print ("Seen an apple before!")
else
print ("Noted that the first apple has been seen")
end
```
## filter:check()
Checks to see if a string is present in the filter set.
#### Syntax
`present = filter:check(string)`
#### Parameters
- `string` The string to be checked for membership in the set.
#### Returns
`true` if the string was already present in the filter. `false` otherwise.
#### Example
```
if filter:check("apple") then
print ("Seen an apple before!")
end
```
## filter:reset()
Empties the filter.
#### Syntax
`filter:reset()`
#### Returns
Nothing
#### Example
```
filter:reset()
```
## filter:info()
Get some status information on the filter.
#### Syntax
`bits, fns, occupancy, fprate = filter:info()`
#### Returns
- `bits` The number of bits in the filter.
- `fns` The number of hash functions in use.
- `occupancy` The number of bits set in the filter.
- `fprate` The approximate chance that the next `check` will return `true` when it should return `false`. This is represented as the inverse of the probability -- i.e. as the n in 1-in-n chance. This value is limited to 1,000,000.
#### Example
```
bits, fns, occupancy, fprate = filter:info()
```

View File

@ -41,6 +41,7 @@ pages:
- 'am2320': 'en/modules/am2320.md' - 'am2320': 'en/modules/am2320.md'
- 'apa102': 'en/modules/apa102.md' - 'apa102': 'en/modules/apa102.md'
- 'bit': 'en/modules/bit.md' - 'bit': 'en/modules/bit.md'
- 'bloom' : 'en/modules/bloom.md'
- 'bme280': 'en/modules/bme280.md' - 'bme280': 'en/modules/bme280.md'
- 'bmp085': 'en/modules/bmp085.md' - 'bmp085': 'en/modules/bmp085.md'
- 'cjson': 'en/modules/cjson.md' - 'cjson': 'en/modules/cjson.md'