Lua源码阅读笔记 - table的长度

table数据结构

首先看一下lua中table的数据结构:

// lobject.h

/*
** Tables
*/

typedef union TKey {
  struct {
    TValuefields;
    struct Node *next;  /* for chaining */
  } nk;
  TValue tvk;
} TKey;


typedef struct Node {
  TValue i_val;
  TKey i_key;
} Node;


typedef struct Table {
  CommonHeader;
  lu_byte flags;  /* 1<<p means tagmethod(p) is not present */ 
  lu_byte lsizenode;  /* log2 of size of `node' array */
  struct Table *metatable;
  TValue *array;  /* array part */
  Node *node;
  Node *lastfree;  /* any free position is before this position */
  GCObject *gclist;
  int sizearray;  /* size of `array' array */
} Table;

table由两部分组成:

  1. array是数组头,sizearray是数组长度。
  2. node是哈希表头,lsizenode是以2为底哈希表长度的对数,即 2lsizenode 为哈希表长度。

table的长度

执行如下lua代码:

tb1 = {1, 2, 3, 4, 5}
tb1[7] = 7
print("tb1 length: "..#tb1)
-- tb1 length: 5

tb2 = {1, 2, 3, 4, 5}
tb2[8] = 8
print("tb2 length: "..#tb2)
-- tb2 length: 8

tb3 = {1, 2, 3, 4, 5}
tb3[8] = 8
tb3[9] = 9
print("tb3 length: "..#tb3)
-- tb3 length: 9

tb4 = {1, 2, 3, 4, 5}
tb4[2] = nil
tb4[3] = nil
print("tb4 length: "..#tb4)
-- tb4 length: 5

tb1长度5,tb2长度8,tb3长度9,tb4长度5。
为什么会这样呢?可以找一下lua中#获取table长度的源代码。
在 lvm.c 文件中的函数void luaV_execute (lua_State *L, int nexeccalls)内,我们可以通过调试找到#对应的操作码OP_LEN,发现#获取table长度即是调用了int luaH_getn (Table *t)

// ltable.c

/*
** Try to find a boundary in table `t'. A `boundary' is an integer index
** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil).
*/
int luaH_getn (Table *t) {
  unsigned int j = t->sizearray;
  if (j > 0 && ttisnil(&t->array[j - 1])) {
    /* there is a boundary in the array part: (binary) search for it */
    unsigned int i = 0;
    while (j - i > 1) {
      unsigned int m = (i+j)/2;
      if (ttisnil(&t->array[m - 1])) j = m;
      else i = m;
    }
    return i;
  }
  /* else must find a boundary in hash part */
  else if (t->node == dummynode)  /* hash part is empty? */
    return j;  /* that is easy... */
  else return unbound_search(t, j);
}

函数中分三种情况:

  1. table长度大于0,且最后一个元素是nil
  2. table的哈希表为空
  3. 其他情况
  • 对于tb1sizearray为8,则数组部分是{1, 2, 3, 4, 5, nil, 7, nil}
    符合情况1,则二分查找Number和nil的分界,所以最终返回值是5。
  • 对于tb2sizearray也是8,数组部分是{1, 2, 3, 4, 5, nil, nil, 8},最后一个元素不为nil,哈希表部分为空。
    符合情况2,所以直接返回sizearray即8。
  • 对于tb3sizearray为8,索引9在哈希表部分,数组部分与tb2相同。
    符合情况3,调用int unbound_search (Table *t, unsigned int j)
// ltable.c

static int unbound_search (Table *t, unsigned int j) {
  unsigned int i = j;  /* i is zero or a present index */
  j++;
  /* find `i' and `j' such that i is present and j is not */
  while (!ttisnil(luaH_getnum(t, j))) {
    i = j;
    j *= 2;
    if (j > cast(unsigned int, MAX_INT)) {  /* overflow? */
      /* table was built with bad purposes: resort to linear search */
      i = 1;
      while (!ttisnil(luaH_getnum(t, i))) i++;
      return i - 1;
    }
  }
  /* now do a binary search between them */
  while (j - i > 1) {
    unsigned int m = (i+j)/2;
    if (ttisnil(luaH_getnum(t, m))) j = m;
    else i = m;
  }
  return i;
}

此函数会在数组部分以外的哈希表里寻找连续的数字索引,搜索边界每次扩大为之前的2倍,再二分查找Number索引与nil的分界。
先找tb3[9],不为nil。继续看tb3[18],是nil,则在tb3[9] - tb3[18]内进行二分查找,所以最终返回9。
不过这种查找方法并不准确,例如:

tb3 = {1, 2, 3, 4, 5}
tb3[8] = 8
tb3[9] = 9
tb3[13] = 13
print("tb3 length: "..#tb3)
-- tb3 length: 13

索引9和13都在哈希表部分,返回长度是13。这是因为在tb3[9] - tb3[18]二分查找的时候,先判断tb3[(9 + 18) / 2]不为nil,所以就在tb3[13] - tb3[18]范围内去查找了,最后找到的分界线是13,直接忽略掉了前半部分为nil的元素。


sizearray为什么是8?

在lvm中,创建table对应的操作码是OP_NEWTABLE

// lvm.c

case OP_NEWTABLE: {
  int b = GETARG_B(i); // 数组元素个数
  int c = GETARG_C(i); // 哈希表元素个数
  sethvalue(L, ra, luaH_new(L, luaO_fb2int(b), luaO_fb2int(c)));
  Protect(luaC_checkGC(L));
  continue;
}

b是数组部分元素个数,c是哈希表部分元素个数,调用Table *luaH_new (lua_State *L, int narray, int nhash)创建新的table:

// ltable.c

Table *luaH_new (lua_State *L, int narray, int nhash) {
  Table *t = luaM_new(L, Table);
  luaC_link(L, obj2gco(t), LUA_TTABLE);
  t->metatable = NULL;
  t->flags = cast_byte(~0);
  /* temporary values (kept only if some malloc fails) */
  t->array = NULL;
  t->sizearray = 0;
  t->lsizenode = 0;
  t->node = cast(Node *, dummynode);
  setarrayvector(L, t, narray);
  setnodevector(L, t, nhash);
  return t;
}

static void setarrayvector (lua_State *L, Table *t, int size) {
  int i;
  luaM_reallocvector(L, t->array, t->sizearray, size, TValue);
  for (i=t->sizearray; i<size; i++)
     setnilvalue(&t->array[i]);
  t->sizearray = size;
}

可以看到,在创建table的时候,就确定了数组和哈希表的长度,数组元素使用nil填充。
对于lua语句tb1 = {1, 2, 3, 4, 5},执行完后sizearray就是5。
那么执行完tb1[7] = 7后,sizearray为什么又变成8了呢?

在lvm里,给table赋值一般有两种方式:

  1. 初始化列表,对应操作码OP_SETLIST
  2. 按索引赋值,对应操作码OP_SETTABLE

上面的语句就对应着第二种方式:

// lvm.c

case OP_SETTABLE: {
  Protect(luaV_settable(L, ra, RKB(i), RKC(i)));
  continue;
}

void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) {
  int loop;
  TValue temp;
  for (loop = 0; loop < MAXTAGLOOP; loop++) {
    const TValue *tm;
    if (ttistable(t)) {  /* `t' is a table? */
      Table *h = hvalue(t);
      TValue *oldval = luaH_set(L, h, key); /* do a primitive set */
      if (!ttisnil(oldval) ||  /* result is no nil? */
          (tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL) { /* or no TM? */
        setobj2t(L, oldval, val);
        h->flags = 0;
        luaC_barriert(L, h, val);
        return;
      }
      /* else will try the tag method */
    }
    else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX)))
      luaG_typeerror(L, t, "index");
    if (ttisfunction(tm)) {
      callTM(L, tm, t, key, val);
      return;
    }
    /* else repeat with `tm' */
    setobj(L, &temp, tm);  /* avoid pointing inside table (may rehash) */
    t = &temp;
  }
  luaG_runerror(L, "loop in settable");
}

对于table,会继续调用TValue *luaH_set (lua_State *L, Table *t, const TValue *key)

// ltable.c

TValue *luaH_set (lua_State *L, Table *t, const TValue *key) {
  const TValue *p = luaH_get(t, key);
  t->flags = 0;
  if (p != luaO_nilobject)
    return cast(TValue *, p);
  else {
    if (ttisnil(key)) luaG_runerror(L, "table index is nil");
    else if (ttisnumber(key) && luai_numisnan(nvalue(key)))
      luaG_runerror(L, "table index is NaN");
    return newkey(L, t, key);
  }
}

如果要赋值的这个key不存在,会继续调用TValue *newkey (lua_State *L, Table *t, const TValue *key)

// ltable.c

/*
** inserts a new key into a hash table; first, check whether key's main 
** position is free. If not, check whether colliding node is in its main 
** position or not: if it is not, move colliding node to an empty place and 
** put new key in its main position; otherwise (colliding node is in its main 
** position), new key goes to an empty position. 
*/
static TValue *newkey (lua_State *L, Table *t, const TValue *key) {
  Node *mp = mainposition(t, key);
  if (!ttisnil(gval(mp)) || mp == dummynode) {
    Node *othern;
    Node *n = getfreepos(t);  /* get a free place */
    if (n == NULL) {  /* cannot find a free place? */
      rehash(L, t, key);  /* grow table */
      return luaH_set(L, t, key);  /* re-insert key into grown table */
    }
    lua_assert(n != dummynode);
    othern = mainposition(t, key2tval(mp));
    if (othern != mp) {  /* is colliding node out of its main position? */
      /* yes; move colliding node into free position */
      while (gnext(othern) != mp) othern = gnext(othern);  /* find previous */
      gnext(othern) = n;  /* redo the chain with `n' in place of `mp' */
      *n = *mp;  /* copy colliding node into free pos. (mp->next also goes) */
      gnext(mp) = NULL;  /* now `mp' is free */
      setnilvalue(gval(mp));
    }
    else {  /* colliding node is in its own main position */
      /* new node will go into free position */
      gnext(n) = gnext(mp);  /* chain new position */
      gnext(mp) = n;
      mp = n;
    }
  }
  gkey(mp)->value = key->value; gkey(mp)->tt = key->tt;
  luaC_barriert(L, t, key);
  lua_assert(ttisnil(gval(mp)));
  return gval(mp);
}

由于tb1的哈希表为空,会调用void rehash (lua_State *L, Table *t, const TValue *ek)

// ltable.c

static void rehash (lua_State *L, Table *t, const TValue *ek) {
  int nasize, na;
  int nums[MAXBITS+1];  /* nums[i] = number of keys between 2^(i-1) and 2^i */
  int i;
  int totaluse;
  for (i=0; i<=MAXBITS; i++) nums[i] = 0;  /* reset counts */
  nasize = numusearray(t, nums);  /* count keys in array part */
  totaluse = nasize;  /* all those keys are integer keys */
  totaluse += numusehash(t, nums, &nasize);  /* count keys in hash part */
  /* count extra key */
  nasize += countint(ek, nums);
  totaluse++;
  /* compute new size for array part */
  na = computesizes(nums, &nasize);
  /* resize the table to new computed sizes */
  resize(L, t, nasize, totaluse - na);
}

这个函数会重新分配table的数组与哈希表的内存空间。
numusearray(t, nums)统计数组部分元素个数,numusehash(t, nums, &nasize)统计哈希表部分key为正整数的元素个数,nasize是最终数组部分要分配的空间长度,na是最终数组部分不为nil的元素个数。

重点看一下函数int computesizes (int nums[], int *narray)

// ltable.c

static int computesizes (int nums[], int *narray) {
  int i;
  int twotoi;  /* 2^i */
  int a = 0;  /* number of elements smaller than 2^i */
  int na = 0;  /* number of elements to go to array part */
  int n = 0;  /* optimal size for array part */
  for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) {
    if (nums[i] > 0) {
      a += nums[i];
      if (a > twotoi/2) {  /* more than half elements present? */
        n = twotoi;  /* optimal size (till now) */
        na = a;  /* all elements smaller than n will go to array part */
      }
    }
    if (a == *narray) break;  /* all elements already counted */
  }
  *narray = n;
  lua_assert(*narray/2 <= na && na <= *narray);
  return na;
}

这里统计数组部分的长度和元素的个数,每次循环容量扩大为之前的两倍,a记录不为nil的元素个数,n表示数组长度。
if (a > twotoi/2)当元素个数大于长度的一半,即数组利用率高于50%的时候,才会扩充数组容量,最终保证数组的利用率高于50%。

所以tb1[7] = 7后,sizearray变为8也就可以理解了:
数组内有6个非nil元素,数组长度在循环中1->2->4->8->16,发现数组利用率低于了50%,保留8作为数组长度。取得元素个数和数组长度后,在void resize (lua_State *L, Table *t, int nasize, int nhsize)中赋值给sizearray,最终的tb1就变成了{1, 2, 3, 4, 5, nil, 7, nil}

tb2tb1同理,那tb3又是什么情况?
tb3tb2多了一个索引9,在计算数组长度的时候,发现数组利用率7 / 16是低于50%的,所以保留8作为数组长度,而多出来的索引9就放在了哈希表中。


好像还漏掉了什么

前面讲了tb1 tb2 tb3,还没有说到tb4

由上面可知,tb4的sizearray为5,在int luaH_getn (Table *t)里符合情况2,所以直接返回sizearray
在执行tb4[2] = niltb4[3] = nil的时候,由于对应位置不为nil,所以没有触发newkeyrehash,所以中间有nil值并没有影响到table的长度。
tb4似乎也并没有什么特别之处。

之所以要单独列出来,是要说一下table的动态内存管理机制。

// ltable.c

static void resize (lua_State *L, Table *t, int nasize, int nhsize) {
  int i;
  int oldasize = t->sizearray;
  int oldhsize = t->lsizenode;
  Node *nold = t->node;  /* save old hash ... */
  if (nasize > oldasize)  /* array part must grow? */
    setarrayvector(L, t, nasize);
  /* create new hash part with appropriate size */
  setnodevector(L, t, nhsize);  
  if (nasize < oldasize) {  /* array part must shrink? */
    t->sizearray = nasize;
    /* re-insert elements from vanishing slice */
    for (i=nasize; i<oldasize; i++) {
      if (!ttisnil(&t->array[i]))
        setobjt2t(L, luaH_setnum(L, t, i+1), &t->array[i]);
    }
    /* shrink array */
    luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
  }
  /* re-insert elements from hash part */
  for (i = twoto(oldhsize) - 1; i >= 0; i--) {
    Node *old = nold+i;
    if (!ttisnil(gval(old)))
      setobjt2t(L, luaH_set(L, t, key2tval(old)), gval(old));
  }
  if (nold != dummynode)
    luaM_freearray(L, nold, twoto(oldhsize), Node);  /* free old array */
}

rehash中确定了数组部分长度和哈希表部分长度后,就会调用void resize (lua_State *L, Table *t, int nasize, int nhsize)重新分配内存。

可以看到,这个过程要进行大量的内存操作,是很费效率的,所以table只有在空间不够的情况下才会rehash,而将已有的元素置为nil并不会触发。


https://blog.csdn.net/fujia_jzyl/article/details/88213815
原创不易,转载请注明出处,谢谢!


版权声明:本文为fujia_jzyl原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。