Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utf8 #5

Open
wants to merge 5 commits into
base: v2.1
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ LJVM_O= lj_vm.o
LJVM_BOUT= $(LJVM_S)
LJVM_MODE= elfasm

LJLIB_O= lib_base.o lib_math.o lib_bit.o lib_string.o lib_table.o \
LJLIB_O= lib_base.o lib_math.o lib_bit.o lib_string.o lib_utf8.o lib_table.o \
lib_io.o lib_os.o lib_package.o lib_debug.o lib_jit.o lib_ffi.o
LJLIB_C= $(LJLIB_O:.o=.c)

Expand Down
3 changes: 3 additions & 0 deletions src/Makefile.dep
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ lib_string.o: lib_string.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
lib_table.o: lib_table.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
lj_def.h lj_arch.h lj_gc.h lj_err.h lj_errmsg.h lj_buf.h lj_str.h \
lj_tab.h lj_ff.h lj_ffdef.h lj_lib.h lj_libdef.h
lib_utf8.o: lib_utf8.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
lj_def.h lj_arch.h lj_err.h lj_errmsg.h lj_buf.h lj_gc.h lj_str.h \
lj_lib.h lj_libdef.h
lj_alloc.o: lj_alloc.c lj_def.h lua.h luaconf.h lj_arch.h lj_alloc.h
lj_api.o: lj_api.c lj_obj.h lua.h luaconf.h lj_def.h lj_arch.h lj_gc.h \
lj_err.h lj_errmsg.h lj_debug.h lj_str.h lj_tab.h lj_func.h lj_udata.h \
Expand Down
1 change: 1 addition & 0 deletions src/lib_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ static const luaL_Reg lj_lib_load[] = {
{ LUA_IOLIBNAME, luaopen_io },
{ LUA_OSLIBNAME, luaopen_os },
{ LUA_STRLIBNAME, luaopen_string },
{ LUA_UTF8LIBNAME, luaopen_utf8 },
{ LUA_MATHLIBNAME, luaopen_math },
{ LUA_DBLIBNAME, luaopen_debug },
{ LUA_BITLIBNAME, luaopen_bit },
Expand Down
249 changes: 249 additions & 0 deletions src/lib_utf8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
/*
** UTF-8 library.
** Copyright (C) 2018.
**
** Major portions taken verbatim or adapted from the Lua interpreter.
** Copyright (C) 1994-2008 Lua.org, PUC-Rio. See Copyright Notice in lua.h
*/

#define lib_utf8_c
#define LUA_LIB

#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"

#include "lj_obj.h"
#include "lj_err.h"
#include "lj_buf.h"
#include "lj_lib.h"


/* ------------------------------------------------------------------------ */

#define LJLIB_MODULE_utf8

#define MAXUNICODE 0x10FFFF
#define iscont(p) ((*(p) & 0xC0) == 0x80)

/*
** Decode one UTF-8 sequence starting at 'o'.
**
** On success the decoded code point is stored in '*val' (when 'val' is
** not NULL) and a pointer to the byte following the sequence is returned.
** Returns NULL if the byte sequence is invalid: a stray or missing
** continuation byte, an overlong encoding, more than three continuation
** bytes, or a value above MAXUNICODE.
**
** The input must be terminated by a byte that is not a continuation byte
** (e.g. '\0'), because continuation bytes are read without a length check.
*/
static const char *utf8_decode (const char *o, int *val) {
  /* limits[n] is the largest value that is NOT acceptable for a sequence
  ** with n continuation bytes (this is what rejects overlong forms). */
  static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
  const unsigned char *s = (const unsigned char *)o;
  unsigned int c = s[0];
  unsigned int res = 0;  /* final result */
  if (c < 0x80) {  /* ascii? */
    res = c;
  } else {
    int count = 0;  /* number of continuation bytes consumed */
    while (c & 0x40) {  /* lead byte announces another continuation byte? */
      unsigned int cc;
      /* A sequence with more than 3 continuation bytes is never valid.
      ** Stopping here also keeps 'count * 5' below the width of
      ** unsigned int, so the shift further down is always defined. */
      if (++count > 3)
        return NULL;
      cc = s[count];  /* read next byte */
      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
        return NULL;
      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
      c <<= 1;  /* to test next bit */
    }
    res |= ((c & 0x7F) << (count * 5));  /* add payload bits of first byte */
    if (res > MAXUNICODE || res <= limits[count])
      return NULL;  /* out of range or overlong */
    s += count;  /* skip continuation bytes read */
  }
  if (val) *val = (int)res;
  return (const char *)s + 1;  /* +1 to include first byte */
}


LJLIB_CF(utf8_char)
{
int i, nargs = (int)(L->top - L->base);
SBuf *sb = lj_buf_tmp_(L);
for (i = 1; i <= nargs; i++) {
int32_t k = lj_lib_checkint(L, i);
if (!checku32(k))
lj_err_arg(L, i, LJ_ERR_BADVAL);
lj_buf_pututf8(sb, k);
}
setstrV(L, L->top-1, lj_buf_str(L, sb));
lj_gc_check(L);
return 1;
}


/*
** utf8len(s [, i [, j]]) --> number of characters that start in the
** range [i,j], or nil + current position if 's' is not well formed in
** that interval.
** 'i' defaults to 1 and 'j' to -1; negative positions count from the end
** of the string. Raises an argument error if 'i' is outside [1, #s+1] or
** 'j' is greater than #s.
*/
LJLIB_CF(utf8_len)
{
  GCstr *str = lj_lib_checkstr(L, 1);
  int32_t len = (int32_t)str->len;
  int32_t posi = lj_lib_optint(L, 2, 1);
  int32_t posj = lj_lib_optint(L, 3, -1);
  const char *s = strdata(str);
  const char *p, *stop;
  int n = 0;

  if (posi < 0) posi += len+1;
  if (posj < 0) {
    posj += len+1;
    /* Still negative: 'j' lies before the string. Clamp to 0 so that
    ** 'stop' below never points in front of the string data; the range
    ** is empty either way. */
    if (posj < 0) posj = 0;
  }

  luaL_argcheck(L, 1 <= posi && posi <= len+1, 2,
                "initial position out of string");
  luaL_argcheck(L, posj <= len, 3,
                "final position out of string");

  p = s + (posi-1);  /* first byte to examine */
  stop = s + posj;  /* one past the last byte where a character may start */

  while (p < stop) {
    const char *nextp = utf8_decode(p, NULL);
    if (nextp == NULL) {  /* conversion error? */
      lua_pushnil(L);  /* return nil ... */
      lua_pushinteger(L, p - s + 1);  /* ... and current position */
      return 2;
    }
    p = nextp;
    n++;
  }
  lua_pushinteger(L, n);
  return 1;
}



/*
** Iterator step for utf8.codes: argument 1 is the string, argument 2 the
** 1-based byte position of the previously returned character (0 before
** the first step). Returns the position and code point of the next
** character, or nothing at the end of the string. Raises an error on an
** invalid sequence.
*/
static int iter_aux (lua_State *L) {
  size_t slen;
  const char *str = luaL_checklstring(L, 1, &slen);
  lua_Integer limit = (lua_Integer)slen;
  lua_Integer pos = lua_tointeger(L, 2) - 1;  /* 0-based previous position */
  const char *after;
  int cp;

  if (pos < 0) {
    pos = 0;  /* first iteration: start at the first byte */
  } else if (pos < limit) {
    /* step over the current lead byte and all of its continuation bytes */
    do {
      pos++;
    } while (iscont(str + pos));
  }

  if (pos >= limit)
    return 0;  /* no more codepoints */

  after = utf8_decode(str + pos, &cp);
  /* a continuation byte right after the sequence means it was malformed */
  if (after == NULL || iscont(after))
    return luaL_error(L, "invalid UTF-8 code");
  lua_pushinteger(L, pos + 1);
  lua_pushinteger(L, cp);
  return 2;
}


/*
** utf8.codes(s) -> iterator function, s, 0
** Returns the three values for a generic 'for' loop: iter_aux as the
** iterator, the string as the state and 0 as the initial control value.
*/
LJLIB_CF(utf8_codes)
{
  /* only the type check matters here; the string is re-read by iter_aux */
  (void)luaL_checklstring(L, 1, NULL);
  lua_pushcclosure(L, iter_aux, 0);  /* iterator, no upvalues */
  lua_pushvalue(L, 1);  /* state: the string itself */
  lua_pushinteger(L, 0);  /* control variable: nothing visited yet */
  return 3;
}



/*
** codepoint(s, [i, [j]]) -> returns codepoints for all characters
** that start in the range [i,j].
** 'i' defaults to 1 and 'j' defaults to 'i'; negative positions count
** from the end of the string. An empty range yields no results. Raises
** an error for out-of-string positions or an invalid byte sequence.
*/
LJLIB_CF(utf8_codepoint)
{
  GCstr *str = lj_lib_checkstr(L, 1);
  int32_t len = str->len;
  int32_t first = lj_lib_optint(L, 2, 1);
  int32_t last = lj_lib_optint(L, 3, first);
  const char *cur, *end;
  int nres = 0;

  /* translate end-relative positions */
  if (last < 0) last += len+1;
  if (first < 0) first += len+1;

  if (first > last)
    return 0;  /* empty interval; return no values */

  luaL_argcheck(L, 1 <= first && first <= len, 2,
                "initial position out of string");
  luaL_argcheck(L, last <= len, 3,
                "final position out of string");

  /* at most one result per byte in the range */
  luaL_checkstack(L, last - first + 1, "string slice too long");

  cur = strdata(str) + (first - 1);
  end = strdata(str) + last;
  while (cur < end) {
    int code;
    cur = utf8_decode(cur, &code);
    if (cur == NULL)
      return luaL_error(L, "invalid UTF-8 code");
    lua_pushinteger(L, code);
    nres++;
  }
  return nres;
}


/*
** offset(s, n, [i]) -> index where n-th character counting from
** position 'i' starts; 0 means character at 'i'.
** 'i' defaults to 1 when 'n' is non-negative and to #s+1 otherwise; a
** negative 'i' counts from the end of the string. Returns nil when there
** is no such character. Raises an error if 'i' is outside [1, #s+1] or,
** for n != 0, if 'i' points at a continuation byte.
*/
LJLIB_CF(utf8_offset)
{
  GCstr *str = lj_lib_checkstr(L, 1);
  int32_t len = (int32_t)str->len;
  int32_t n = lj_lib_checkint(L, 2);
  /* Full-width position: a 16 bit variable would truncate byte offsets
  ** in strings longer than 32767 bytes. */
  int32_t posi = lj_lib_optint(L, 3, (n >= 0) ? 1 : len + 1);
  const char *s = strdata(str);

  /* translate an end-relative position, as utf8.len/utf8.codepoint do */
  if (posi < 0) posi += len+1;

  /* on success 'posi' becomes a 0-based byte index in [0, len] */
  luaL_argcheck(L, 1 <= posi && --posi <= len, 3,
                "position out of range");

  if (n == 0) {
    /* find beginning of current byte sequence */
    while (posi > 0 && iscont(s + posi)) posi--;
  } else {
    if (iscont(s + posi))
      return luaL_error(L, "initial position is a continuation byte");
    if (n < 0) {
      while (n < 0 && posi > 0) {  /* move back */
        do {  /* find beginning of previous character */
          posi--;
        } while (posi > 0 && iscont(s + posi));
        n++;
      }
    } else {
      n--;  /* do not move for 1st character */
      while (n > 0 && posi < len) {
        do {  /* find beginning of next character */
          posi++;
        } while (iscont(s + posi));  /* (cannot pass final '\0') */
        n--;
      }
    }
  }
  if (n == 0)  /* did it find given character? */
    lua_pushinteger(L, posi + 1);
  else  /* no such character */
    lua_pushnil(L);
  return 1;
}


/* ------------------------------------------------------------------------ */

#include "lj_libdef.h"

/* pattern to match a single UTF-8 character */
#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"

/*
** Open the utf8 library: register its functions and store the character
** pattern in the field "charpattern". Returns 1 value.
*/
LUALIB_API int luaopen_utf8(lua_State *L)
{
  /* The pattern contains an embedded '\0', so its length has to come
  ** from sizeof (minus the terminator) rather than from strlen. */
  static const char patt[] = UTF8PATT;
  LJ_LIB_REG(L, LUA_UTF8LIBNAME, utf8);
  lua_pushlstring(L, patt, sizeof(patt) - 1);
  /* index -2 is presumably the library table left by LJ_LIB_REG */
  lua_setfield(L, -2, "charpattern");
  return 1;
}
19 changes: 19 additions & 0 deletions src/lj_buf.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,25 @@ SBuf * LJ_FASTCALL lj_buf_putstr(SBuf *sb, GCstr *s)
return sb;
}

SBuf * lj_buf_pututf8(SBuf* sb, uint32_t x)
{
if (x < 0x800) {
if (x < 0x80)
return lj_buf_putchar(sb, x);
lj_buf_putchar(sb, 0xc0 | (x >> 6));
} else {
if (x >= 0x10000) {
lj_buf_putchar(sb, 0xf0 | (x >> 18));
lj_buf_putchar(sb, 0x80 | ((x >> 12) & 0x3f));
} else {
// if (x >= 0xd800 && x < 0xe000) goto err_utf8;
lj_buf_putchar(sb, 0xe0 | (x >> 12));
}
lj_buf_putchar(sb, 0x80 | ((x >> 6) & 0x3f));
}
return lj_buf_putchar(sb, 0x80 | (x & 0x3f));
}

/* -- High-level buffer put operations ------------------------------------ */

SBuf * LJ_FASTCALL lj_buf_putstr_reverse(SBuf *sb, GCstr *s)
Expand Down
1 change: 1 addition & 0 deletions src/lj_buf.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ static LJ_AINLINE char *lj_buf_more(SBuf *sb, MSize sz)
LJ_FUNC SBuf *lj_buf_putmem(SBuf *sb, const void *q, MSize len);
LJ_FUNC SBuf * LJ_FASTCALL lj_buf_putchar(SBuf *sb, int c);
LJ_FUNC SBuf * LJ_FASTCALL lj_buf_putstr(SBuf *sb, GCstr *s);
LJ_FUNC SBuf * LJ_FASTCALL lj_buf_pututf8(SBuf *sb, uint32_t x);

static LJ_AINLINE char *lj_buf_wmem(char *p, const void *q, MSize len)
{
Expand Down
5 changes: 4 additions & 1 deletion src/lj_strfmt.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

static const uint8_t strfmt_map[('x'-'A')+1] = {
STRFMT_A,0,0,0,STRFMT_E,STRFMT_F,STRFMT_G,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,STRFMT_X,0,0,
0,0,0,0,0,0,0,STRFMT_UTF,0,0,STRFMT_X,0,0,
0,0,0,0,0,0,
STRFMT_A,0,STRFMT_C,STRFMT_D,STRFMT_E,STRFMT_F,STRFMT_G,0,STRFMT_I,0,0,0,0,
0,STRFMT_O,STRFMT_P,STRFMT_Q,0,STRFMT_S,0,STRFMT_U,0,0,STRFMT_X
Expand Down Expand Up @@ -443,6 +443,9 @@ const char *lj_strfmt_pushvf(lua_State *L, const char *fmt, va_list argp)
case STRFMT_CHAR:
lj_buf_putb(sb, va_arg(argp, int));
break;
case STRFMT_UTF:
lj_buf_pututf8(sb, va_arg(argp, long int));
break;
case STRFMT_PTR:
lj_strfmt_putptr(sb, va_arg(argp, void *));
break;
Expand Down
2 changes: 1 addition & 1 deletion src/lj_strfmt.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ typedef struct FormatState {
/* Format types (max. 16). */
typedef enum FormatType {
STRFMT_EOF, STRFMT_ERR, STRFMT_LIT,
STRFMT_INT, STRFMT_UINT, STRFMT_NUM, STRFMT_STR, STRFMT_CHAR, STRFMT_PTR
STRFMT_INT, STRFMT_UINT, STRFMT_NUM, STRFMT_STR, STRFMT_CHAR, STRFMT_PTR, STRFMT_UTF
} FormatType;

/* Format subtypes (bits are reused). */
Expand Down
2 changes: 2 additions & 0 deletions src/lualib.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#define LUA_COLIBNAME "coroutine"
#define LUA_MATHLIBNAME "math"
#define LUA_STRLIBNAME "string"
#define LUA_UTF8LIBNAME "utf8"
#define LUA_TABLIBNAME "table"
#define LUA_IOLIBNAME "io"
#define LUA_OSLIBNAME "os"
Expand All @@ -25,6 +26,7 @@
LUALIB_API int luaopen_base(lua_State *L);
LUALIB_API int luaopen_math(lua_State *L);
LUALIB_API int luaopen_string(lua_State *L);
LUALIB_API int luaopen_utf8(lua_State *L);
LUALIB_API int luaopen_table(lua_State *L);
LUALIB_API int luaopen_io(lua_State *L);
LUALIB_API int luaopen_os(lua_State *L);
Expand Down
6 changes: 6 additions & 0 deletions tests/LuaJIT-test-cleanup/test/lang/concat.lua
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,9 @@ do --- Very long strings
assert(s:sub(1, 6) == s:sub(-6, -1))
end
end

do --- UTF-8 hexcodes in strings
-- Checks that \u{XXXX} escapes in string literals yield the UTF-8 encoded
-- characters, covering one 2-byte case (a5) and three 3-byte cases.
local x
-- The concatenation runs 100 times; presumably this is meant to exercise
-- the compiled path as well -- TODO confirm.
for i=1,100 do x = '\u{4f6e} - ' .. '\u{d61} - \u{4f1} - \u{a5}' end
assert(x == '佮 - ൡ - ӱ - ¥')
end
3 changes: 2 additions & 1 deletion tests/LuaJIT-test-cleanup/test/lib/index
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ base
bit.lua +bit
math
string
utf8.lua
table
coroutine
ffi +ffi
contents.lua
contents.lua
Loading