table数据结构
首先看一下lua中table的数据结构:
// lobject.h
/*
** Tables
*/
typedef union TKey {
struct {
TValuefields;
struct Node *next; /* for chaining */
} nk;
TValue tvk;
} TKey;
typedef struct Node {
TValue i_val;
TKey i_key;
} Node;
typedef struct Table {
CommonHeader;
lu_byte flags; /* 1<<p means tagmethod(p) is not present */
lu_byte lsizenode; /* log2 of size of `node' array */
struct Table *metatable;
TValue *array; /* array part */
Node *node;
Node *lastfree; /* any free position is before this position */
GCObject *gclist;
int sizearray; /* size of `array' array */
} Table;
table由两部分组成:
array是数组头,sizearray是数组长度。node是哈希表头,lsizenode是以2为底哈希表长度的对数,即 2lsizenode 为哈希表长度。
table的长度
执行如下lua代码:
tb1 = {1, 2, 3, 4, 5}
tb1[7] = 7
print("tb1 length: "..#tb1)
-- tb1 length: 5
tb2 = {1, 2, 3, 4, 5}
tb2[8] = 8
print("tb2 length: "..#tb2)
-- tb2 length: 8
tb3 = {1, 2, 3, 4, 5}
tb3[8] = 8
tb3[9] = 9
print("tb3 length: "..#tb3)
-- tb3 length: 9
tb4 = {1, 2, 3, 4, 5}
tb4[2] = nil
tb4[3] = nil
print("tb4 length: "..#tb4)
-- tb4 length: 5
tb1长度5,tb2长度8,tb3长度9,tb4长度5。
为什么会这样呢?可以找一下lua中#获取table长度的源代码。
在 lvm.c 文件中的函数void luaV_execute (lua_State *L, int nexeccalls)内,我们可以通过调试找到#对应的操作码OP_LEN,发现#获取table长度即是调用了int luaH_getn (Table *t):
// ltable.c
/*
** Try to find a boundary in table `t'. A `boundary' is an integer index
** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil).
*/
int luaH_getn (Table *t) {
unsigned int j = t->sizearray;
if (j > 0 && ttisnil(&t->array[j - 1])) {
/* there is a boundary in the array part: (binary) search for it */
unsigned int i = 0;
while (j - i > 1) {
unsigned int m = (i+j)/2;
if (ttisnil(&t->array[m - 1])) j = m;
else i = m;
}
return i;
}
/* else must find a boundary in hash part */
else if (t->node == dummynode) /* hash part is empty? */
return j; /* that is easy... */
else return unbound_search(t, j);
}
函数中分三种情况:
- table长度大于0,且最后一个元素是nil
- table的哈希表为空
- 其他情况
- 对于
tb1,sizearray为8,则数组部分是{1, 2, 3, 4, 5, nil, 7, nil}。
符合情况1,则二分查找Number和nil的分界,所以最终返回值是5。 - 对于
tb2,sizearray也是8,数组部分是{1, 2, 3, 4, 5, nil, nil, 8},最后一个元素不为nil,哈希表部分为空。
符合情况2,所以直接返回sizearray即8。 - 对于
tb3,sizearray为8,索引9在哈希表部分,数组部分与tb2相同。
符合情况3,调用int unbound_search (Table *t, unsigned int j):
// ltable.c
static int unbound_search (Table *t, unsigned int j) {
unsigned int i = j; /* i is zero or a present index */
j++;
/* find `i' and `j' such that i is present and j is not */
while (!ttisnil(luaH_getnum(t, j))) {
i = j;
j *= 2;
if (j > cast(unsigned int, MAX_INT)) { /* overflow? */
/* table was built with bad purposes: resort to linear search */
i = 1;
while (!ttisnil(luaH_getnum(t, i))) i++;
return i - 1;
}
}
/* now do a binary search between them */
while (j - i > 1) {
unsigned int m = (i+j)/2;
if (ttisnil(luaH_getnum(t, m))) j = m;
else i = m;
}
return i;
}
此函数会在数组部分以外的哈希表里寻找连续的数字索引,搜索边界每次扩大为之前的2倍,再二分查找Number索引与nil的分界。
先找tb3[9],不为nil。继续看tb3[18],是nil,则在tb3[9] - tb3[18]内进行二分查找,所以最终返回9。
不过这种查找方法并不准确,例如:
tb3 = {1, 2, 3, 4, 5}
tb3[8] = 8
tb3[9] = 9
tb3[13] = 13
print("tb3 length: "..#tb3)
-- tb3 length: 13
索引9和13都在哈希表部分,返回长度是13。这是因为在tb3[9] - tb3[18]二分查找的时候,先判断tb3[(9 + 18) / 2]不为nil,所以就在tb3[13] - tb3[18]范围内去查找了,最后找到的分界线是13,直接忽略掉了前半部分为nil的元素。
sizearray为什么是8?
在lvm中,创建table对应的操作码是OP_NEWTABLE:
// lvm.c
case OP_NEWTABLE: {
int b = GETARG_B(i); // 数组元素个数
int c = GETARG_C(i); // 哈希表元素个数
sethvalue(L, ra, luaH_new(L, luaO_fb2int(b), luaO_fb2int(c)));
Protect(luaC_checkGC(L));
continue;
}
b是数组部分元素个数,c是哈希表部分元素个数,调用Table *luaH_new (lua_State *L, int narray, int nhash)创建新的table:
// ltable.c
Table *luaH_new (lua_State *L, int narray, int nhash) {
Table *t = luaM_new(L, Table);
luaC_link(L, obj2gco(t), LUA_TTABLE);
t->metatable = NULL;
t->flags = cast_byte(~0);
/* temporary values (kept only if some malloc fails) */
t->array = NULL;
t->sizearray = 0;
t->lsizenode = 0;
t->node = cast(Node *, dummynode);
setarrayvector(L, t, narray);
setnodevector(L, t, nhash);
return t;
}
static void setarrayvector (lua_State *L, Table *t, int size) {
int i;
luaM_reallocvector(L, t->array, t->sizearray, size, TValue);
for (i=t->sizearray; i<size; i++)
setnilvalue(&t->array[i]);
t->sizearray = size;
}
可以看到,在创建table的时候,就确定了数组和哈希表的长度,数组元素使用nil填充。
对于lua语句tb1 = {1, 2, 3, 4, 5},执行完后sizearray就是5。
那么执行完tb1[7] = 7后,sizearray为什么又变成8了呢?
在lvm里,给table赋值一般有两种方式:
- 初始化列表,对应操作码
OP_SETLIST - 按索引赋值,对应操作码
OP_SETTABLE
上面的语句就对应着第二种方式:
// lvm.c
case OP_SETTABLE: {
Protect(luaV_settable(L, ra, RKB(i), RKC(i)));
continue;
}
void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) {
int loop;
TValue temp;
for (loop = 0; loop < MAXTAGLOOP; loop++) {
const TValue *tm;
if (ttistable(t)) { /* `t' is a table? */
Table *h = hvalue(t);
TValue *oldval = luaH_set(L, h, key); /* do a primitive set */
if (!ttisnil(oldval) || /* result is no nil? */
(tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL) { /* or no TM? */
setobj2t(L, oldval, val);
h->flags = 0;
luaC_barriert(L, h, val);
return;
}
/* else will try the tag method */
}
else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX)))
luaG_typeerror(L, t, "index");
if (ttisfunction(tm)) {
callTM(L, tm, t, key, val);
return;
}
/* else repeat with `tm' */
setobj(L, &temp, tm); /* avoid pointing inside table (may rehash) */
t = &temp;
}
luaG_runerror(L, "loop in settable");
}
对于table,会继续调用TValue *luaH_set (lua_State *L, Table *t, const TValue *key):
// ltable.c
TValue *luaH_set (lua_State *L, Table *t, const TValue *key) {
const TValue *p = luaH_get(t, key);
t->flags = 0;
if (p != luaO_nilobject)
return cast(TValue *, p);
else {
if (ttisnil(key)) luaG_runerror(L, "table index is nil");
else if (ttisnumber(key) && luai_numisnan(nvalue(key)))
luaG_runerror(L, "table index is NaN");
return newkey(L, t, key);
}
}
如果要赋值的这个key不存在,会继续调用TValue *newkey (lua_State *L, Table *t, const TValue *key):
// ltable.c
/*
** inserts a new key into a hash table; first, check whether key's main
** position is free. If not, check whether colliding node is in its main
** position or not: if it is not, move colliding node to an empty place and
** put new key in its main position; otherwise (colliding node is in its main
** position), new key goes to an empty position.
*/
static TValue *newkey (lua_State *L, Table *t, const TValue *key) {
Node *mp = mainposition(t, key);
if (!ttisnil(gval(mp)) || mp == dummynode) {
Node *othern;
Node *n = getfreepos(t); /* get a free place */
if (n == NULL) { /* cannot find a free place? */
rehash(L, t, key); /* grow table */
return luaH_set(L, t, key); /* re-insert key into grown table */
}
lua_assert(n != dummynode);
othern = mainposition(t, key2tval(mp));
if (othern != mp) { /* is colliding node out of its main position? */
/* yes; move colliding node into free position */
while (gnext(othern) != mp) othern = gnext(othern); /* find previous */
gnext(othern) = n; /* redo the chain with `n' in place of `mp' */
*n = *mp; /* copy colliding node into free pos. (mp->next also goes) */
gnext(mp) = NULL; /* now `mp' is free */
setnilvalue(gval(mp));
}
else { /* colliding node is in its own main position */
/* new node will go into free position */
gnext(n) = gnext(mp); /* chain new position */
gnext(mp) = n;
mp = n;
}
}
gkey(mp)->value = key->value; gkey(mp)->tt = key->tt;
luaC_barriert(L, t, key);
lua_assert(ttisnil(gval(mp)));
return gval(mp);
}
由于tb1的哈希表为空,会调用void rehash (lua_State *L, Table *t, const TValue *ek):
// ltable.c
static void rehash (lua_State *L, Table *t, const TValue *ek) {
int nasize, na;
int nums[MAXBITS+1]; /* nums[i] = number of keys between 2^(i-1) and 2^i */
int i;
int totaluse;
for (i=0; i<=MAXBITS; i++) nums[i] = 0; /* reset counts */
nasize = numusearray(t, nums); /* count keys in array part */
totaluse = nasize; /* all those keys are integer keys */
totaluse += numusehash(t, nums, &nasize); /* count keys in hash part */
/* count extra key */
nasize += countint(ek, nums);
totaluse++;
/* compute new size for array part */
na = computesizes(nums, &nasize);
/* resize the table to new computed sizes */
resize(L, t, nasize, totaluse - na);
}
这个函数会重新分配table的数组与哈希表的内存空间。
numusearray(t, nums)统计数组部分元素个数,numusehash(t, nums, &nasize)统计哈希表部分key为正整数的元素个数,nasize是最终数组部分要分配的空间长度,na是最终数组部分不为nil的元素个数。
重点看一下函数int computesizes (int nums[], int *narray):
// ltable.c
static int computesizes (int nums[], int *narray) {
int i;
int twotoi; /* 2^i */
int a = 0; /* number of elements smaller than 2^i */
int na = 0; /* number of elements to go to array part */
int n = 0; /* optimal size for array part */
for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) {
if (nums[i] > 0) {
a += nums[i];
if (a > twotoi/2) { /* more than half elements present? */
n = twotoi; /* optimal size (till now) */
na = a; /* all elements smaller than n will go to array part */
}
}
if (a == *narray) break; /* all elements already counted */
}
*narray = n;
lua_assert(*narray/2 <= na && na <= *narray);
return na;
}
这里统计数组部分的长度和元素的个数,每次循环容量扩大为之前的两倍,a记录不为nil的元素个数,n表示数组长度。
if (a > twotoi/2)当元素个数大于长度的一半,即数组利用率高于50%的时候,才会扩充数组容量,最终保证数组的利用率高于50%。
所以tb1[7] = 7后,sizearray变为8也就可以理解了:
数组内有6个非nil元素,数组长度在循环中1->2->4->8->16,发现数组利用率低于了50%,保留8作为数组长度。取得元素个数和数组长度后,在void resize (lua_State *L, Table *t, int nasize, int nhsize)中赋值给sizearray,最终的tb1就变成了{1, 2, 3, 4, 5, nil, 7, nil}。
tb2与tb1同理,那tb3又是什么情况?
tb3比tb2多了一个索引9,在计算数组长度的时候,发现数组利用率7 / 16是低于50%的,所以保留8作为数组长度,而多出来的索引9就放在了哈希表中。
好像还漏掉了什么
前面讲了tb1 tb2 tb3,还没有说到tb4。
由上面可知,tb4的sizearray为5,在int luaH_getn (Table *t)里符合情况2,所以直接返回sizearray。
在执行tb4[2] = nil和tb4[3] = nil的时候,由于对应位置不为nil,所以没有触发newkey和rehash,所以中间有nil值并没有影响到table的长度。
tb4似乎也并没有什么特别之处。
之所以要单独列出来,是要说一下table的动态内存管理机制。
// ltable.c
static void resize (lua_State *L, Table *t, int nasize, int nhsize) {
int i;
int oldasize = t->sizearray;
int oldhsize = t->lsizenode;
Node *nold = t->node; /* save old hash ... */
if (nasize > oldasize) /* array part must grow? */
setarrayvector(L, t, nasize);
/* create new hash part with appropriate size */
setnodevector(L, t, nhsize);
if (nasize < oldasize) { /* array part must shrink? */
t->sizearray = nasize;
/* re-insert elements from vanishing slice */
for (i=nasize; i<oldasize; i++) {
if (!ttisnil(&t->array[i]))
setobjt2t(L, luaH_setnum(L, t, i+1), &t->array[i]);
}
/* shrink array */
luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
}
/* re-insert elements from hash part */
for (i = twoto(oldhsize) - 1; i >= 0; i--) {
Node *old = nold+i;
if (!ttisnil(gval(old)))
setobjt2t(L, luaH_set(L, t, key2tval(old)), gval(old));
}
if (nold != dummynode)
luaM_freearray(L, nold, twoto(oldhsize), Node); /* free old array */
}
在rehash中确定了数组部分长度和哈希表部分长度后,就会调用void resize (lua_State *L, Table *t, int nasize, int nhsize)重新分配内存。
可以看到,这个过程要进行大量的内存操作,是很费效率的,所以table只有在空间不够的情况下才会rehash,而将已有的元素置为nil并不会触发。
https://blog.csdn.net/fujia_jzyl/article/details/88213815
原创不易,转载请注明出处,谢谢!