先看LUA中关于字符串TString的源码:
/* ** Header for string value; string bytes follow the end of this structure ** (aligned according to 'UTString'; see next). */ typedef struct TString { CommonHeader;//可GC对象的头 lu_byte extra; /* reserved words for short strings; "has hash" for longs */ //标记是否是虚拟机保留的字符串,如果是短字符串,1就是lua中保留的字符串(关键字),不可GC;长字符串1表示已经hash. lu_byte shrlen; /* length for short strings *///短字符串的长度 unsigned int hash;//字符串的hash值,字符串的比较可以通过hash值 union { size_t lnglen; /* length for long strings */ //长字符串的长度 struct TString *hnext; /* linked list for hash table *///指向下一个字符串 } u; } TString; /* ** Ensures that address after this type is always fully aligned. **L_Umaxalign是一个宏,用来保证UTString结构里的TString按照这个长度来对齐 */ typedef union UTString { L_Umaxalign dummy; /* ensures maximum alignment for strings */ TString tsv; } UTString;
extra值非0时(对短字符串而言)表示这是虚拟机保留的字符串,也就是关键字(其值为该关键字在luaX_tokens中的下标加1,见后面的luaX_init);这类字符串不会被自动回收,在GC过程中会略过对它的处理.
/*
** Spelling of each token.  luaX_init interns the first NUM_RESERVED
** entries of this table and marks them as reserved words, storing
** (index + 1) in the 'extra' field of each string.
** NOTE(review): the 'ORDER RESERVED' tag presumably means the order must
** match an enumeration declared elsewhere -- confirm before reordering.
*/
/* ORDER RESERVED */
static const char *const luaX_tokens [] = {
    "and", "break", "do", "else", "elseif",
    "end", "false", "for", "function", "goto", "if",
    "in", "local", "nil", "not", "or", "repeat",
    "return", "then", "true", "until", "while",
    "//", "..", "...", "==", ">=", "<=", "~=",
    "<<", ">>", "::", "<eof>",
    "<number>", "<integer>", "<name>", "<string>"
};
创建新的字符串:
/*
** New string with explicit length 'l'.  Strings of at most
** LUAI_MAXSHORTLEN bytes go through the interning path; anything longer
** becomes a separate long-string object holding a copy of the bytes.
*/
TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  TString *lng;
  if (l <= LUAI_MAXSHORTLEN)  /* short string? */
    return internshrstr(L, str, l);
  /* long string: refuse sizes that cannot be represented */
  if (l >= (MAX_SIZE - sizeof(TString))/sizeof(char))
    luaM_toobig(L);
  lng = luaS_createlngstrobj(L, l);
  memcpy(getstr(lng), str, l * sizeof(char));
  return lng;
}
lua中字符串分为长字符串和短字符串,这两个处理方式略有不同. 当l<=LUAI_MAXSHORTLEN时属于短字符串,LUAI_MAXSHORTLEN默认是40.
先看短字符吧:
通过str、长度以及seed(种子)算出该str的hash值,根据hash值在全局字符串表(global_State中的strt,短字符串都存放在这里)中找到对应的链表.如果链表中已有长度和所有字符都相同的字符串,就直接复用;否则需要新建一个. 从代码中可以看出,新建之前如果表中已有的字符串个数(nuse)不小于桶的个数(size),且size不超过MAX_INT/2,就调用luaS_resize把表扩大为原来的2倍.
/*
** Interns a short string: if the string table already holds a string
** with the same length and bytes, that object is returned (and brought
** back to life if it was dead but not yet collected); otherwise a new
** object is created and pushed at the head of its bucket.
*/
static TString *internshrstr (lua_State *L, const char *str, size_t l) {
  global_State *g = G(L);
  unsigned int h = luaS_hash(str, l, g->seed);
  TString **bucket = &g->strt.hash[lmod(h, g->strt.size)];
  TString *ts;
  lua_assert(str != NULL);  /* otherwise 'memcmp'/'memcpy' are undefined */
  ts = *bucket;
  while (ts != NULL) {
    if (l == ts->shrlen &&
        memcmp(str, getstr(ts), l * sizeof(char)) == 0) {
      /* found! */
      if (isdead(g, ts))  /* dead (but not collected yet)? */
        changewhite(ts);  /* resurrect it */
      return ts;
    }
    ts = ts->u.hnext;
  }
  /* no match: double the table first when it holds at least 'size' strings */
  if (g->strt.nuse >= g->strt.size && g->strt.size <= MAX_INT/2) {
    luaS_resize(L, g->strt.size * 2);
    bucket = &g->strt.hash[lmod(h, g->strt.size)];  /* recompute with new size */
  }
  ts = createstrobj(L, l, LUA_TSHRSTR, h);
  memcpy(getstr(ts), str, l * sizeof(char));
  ts->shrlen = cast_byte(l);
  ts->u.hnext = *bucket;  /* new string becomes the head of the chain */
  *bucket = ts;
  g->strt.nuse++;
  return ts;
}
createstrobj是真正创建lua字符串的函数:
/*
** Allocates a string object of kind 'tag' with room for 'l' bytes plus
** the terminator, stores 'h' in its hash field, clears 'extra', and
** writes the ending '\0'.  The caller fills in the bytes and the length.
*/
static TString *createstrobj (lua_State *L, size_t l, int tag, unsigned int h) {
  GCObject *o = luaC_newobj(L, tag, sizelstring(l));  /* whole object size */
  TString *ts = gco2ts(o);
  ts->hash = h;
  ts->extra = 0;
  getstr(ts)[l] = '\0';  /* ending 0 */
  return ts;
}
sizelstring(l)是宏,在l的基础上加上sizeof(UTString)+1, 这个1是为'\0'准备的. 宏gco2ts是将GCObject对象转换为TString对象. 从extra的赋值上可以看到它被赋值为0,说明新建的字符串默认不是虚拟机保留的字符串.
这区别于之前luaX_tokens中由虚拟机保留的字符串.源码如下:
llex.c
/*
** Lexer initialization: creates the string for the environment name and
** one string per reserved word, and hands each of them to luaC_fix so
** that it is never collected.  Each luaC_fix call comes right after the
** string is created; luaC_fix asserts that the object is the first one
** in the 'allgc' list.
*/
void luaX_init (lua_State *L) {
  int i;
  TString *e = luaS_newliteral(L, LUA_ENV);  /* create env name */
  luaC_fix(L, obj2gco(e));  /* never collect this name */
  for (i=0; i<NUM_RESERVED; i++) {
    TString *ts = luaS_new(L, luaX_tokens[i]);
    luaC_fix(L, obj2gco(ts));  /* reserved words are never collected */
    /* index + 1, so 'extra' is nonzero for every reserved word */
    ts->extra = cast_byte(i+1);  /* reserved word */
  }
}
luaC_fix保证保留字符串不被gc
lgc.c
/*
** Moves object 'o' from the 'allgc' list to the 'fixedgc' list.
** 'o' must currently be the first element of 'allgc' (asserted), which
** is why unlinking it only needs to advance the list head.
*/
void luaC_fix (lua_State *L, GCObject *o) {
  global_State *g = G(L);
  lua_assert(g->allgc == o);  /* object must be 1st in 'allgc' list! */
  white2gray(o);  /* they will be gray forever */
  g->allgc = o->next;  /* remove object from 'allgc' list */
  o->next = g->fixedgc;  /* link it to 'fixedgc' list */
  g->fixedgc = o;  /* 'o' is the new head of 'fixedgc' */
}
fixedgc是不可gc的链表.
长字符串创建:
/*
** Creates a long-string object for 'l' bytes.  The hash field is
** initialized with the global seed and the length is recorded in
** 'u.lnglen'; the bytes themselves are left for the caller to fill in.
*/
TString *luaS_createlngstrobj (lua_State *L, size_t l) {
  TString *lng;
  lng = createstrobj(L, l, LUA_TLNGSTR, G(L)->seed);
  lng->u.lnglen = l;
  return lng;
}
长字符串的长度是保存在联合结构体内的lnglen中,和短字符串不同.
接下来看看hash的取法:
/*
** Hashes the 'l' bytes at 'str', starting from 'seed' mixed with the
** length.  Bytes are visited from the end towards the start, skipping
** 'step' positions at a time; 'step' is (l >> LUAI_HASHLIMIT) + 1, so
** it grows with the length and not every byte of a long input is read.
*/
unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
  unsigned int h = seed ^ cast(unsigned int, l);
  size_t step = (l >> LUAI_HASHLIMIT) + 1;
  while (l >= step) {
    h ^= ((h<<5) + (h>>2) + cast_byte(str[l - 1]));
    l -= step;
  }
  return h;
}


/*
** Returns the hash of a long string, computing it on first use.
** Until then 'ts->hash' holds the seed, and 'ts->extra' records whether
** the real hash has been stored.
*/
unsigned int luaS_hashlongstr (TString *ts) {
  lua_assert(ts->tt == LUA_TLNGSTR);
  if (ts->extra != 0)  /* hash already computed? */
    return ts->hash;
  ts->hash = luaS_hash(getstr(ts), ts->u.lnglen, ts->hash);
  ts->extra = 1;  /* now it has its hash */
  return ts->hash;
}
从源码上看,长字符串属于惰性求hash值:如果已经存在hash值(extra为1),就直接返回,不再重新计算.
在求hash上, 最新版本新加了种子,全局表中的随机种子: g->seed = makeseed(L); 就是防止产生的hash值相同的太多,导致生成的链表过长,加大了查找和插入的时间.
lua中字符串hash用的是JSHash,关于字符串的各种hash函数,可以参考:
http://blog.csdn.net/u014269285/article/details/79518334
/*
** equality for short strings, which are always internalized
** (so the result is simply the pointer comparison (a) == (b)).
** NOTE(review): check_exp presumably asserts its first argument, i.e.
** that 'a' is tagged as a short string, when assertions are enabled --
** confirm against its definition.
*/
#define eqshrstr(a,b)	check_exp((a)->tt == LUA_TSHRSTR, (a) == (b))
短字符串的比较直接比较地址,因为在lua中相同的短字符串只会存在一份.
/*
** equality for long strings: two long strings are equal when they are
** the same object, or when they have the same length and the same bytes.
** Returns 1 if equal, 0 otherwise.
*/
int luaS_eqlngstr (TString *a, TString *b) {
  size_t len = a->u.lnglen;
  lua_assert(a->tt == LUA_TLNGSTR && b->tt == LUA_TLNGSTR);
  if (a == b)  /* same instance */
    return 1;
  if (len != b->u.lnglen)  /* different lengths cannot match */
    return 0;
  return memcmp(getstr(a), getstr(b), len) == 0;  /* equal contents? */
}
长字符串需要先比较长度,再比较内容.