Implement __utf16__() and __utf32__() for the DB family

Implement __utf16__() and __utf32__() for the DB family of
pseudo-instructions.  Not yet implemented for evaluation context.
This commit is contained in:
H. Peter Anvin 2008-06-14 16:53:48 -07:00
parent dfaa278cd5
commit 518df30308
10 changed files with 327 additions and 112 deletions

View file

@ -67,8 +67,8 @@ NASM = nasm.$(O) nasmlib.$(O) raa.$(O) saa.$(O) \
output/outobj.$(O) output/outas86.$(O) output/outrdf2.$(O) \
output/outdbg.$(O) output/outieee.$(O) output/outmacho.$(O) \
preproc.$(O) quote.$(O) pptok.$(O) macros.$(O) \
listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) tokhash.$(O) \
regvals.$(O) regflags.$(O)
listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) strfunc.$(O) \
tokhash.$(O) regvals.$(O) regflags.$(O)
NDISASM = ndisasm.$(O) disasm.$(O) sync.$(O) nasmlib.$(O) \
insnsd.$(O) insnsb.$(O) insnsn.$(O) regs.$(O) regdis.$(O)
@ -234,7 +234,7 @@ alldeps: perlreq
#-- Everything below is generated by mkdep.pl - do not edit --#
assemble.$(O): assemble.c assemble.h compiler.h config.h insns.h insnsi.h \
nasm.h nasmlib.h regs.h tables.h tokens.h version.h
crc64.$(O): crc64.c compiler.h config.h
crc64.$(O): crc64.c compiler.h config.h nasmlib.h
disasm.$(O): disasm.c compiler.h config.h disasm.h insns.h insnsi.h nasm.h \
nasmlib.h regdis.h regs.h sync.h tables.h tokens.h version.h
eval.$(O): eval.c compiler.h config.h eval.h float.h insnsi.h labels.h \
@ -309,6 +309,8 @@ regvals.$(O): regvals.c compiler.h config.h insnsi.h tables.h
saa.$(O): saa.c compiler.h config.h nasmlib.h saa.h
stdscan.$(O): stdscan.c compiler.h config.h insns.h insnsi.h nasm.h \
nasmlib.h quote.h regs.h stdscan.h tokens.h version.h
strfunc.$(O): strfunc.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
version.h
sync.$(O): sync.c compiler.h config.h nasmlib.h sync.h
tokhash.$(O): tokhash.c compiler.h config.h hashtbl.h insns.h insnsi.h \
nasm.h nasmlib.h regs.h tokens.h version.h

View file

@ -180,7 +180,7 @@ everything: all doc rdf
#-- Everything below is generated by mkdep.pl - do not edit --#
assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h \
nasmlib.h regs.h tables.h tokens.h version.h
crc64.$(O): crc64.c compiler.h
crc64.$(O): crc64.c compiler.h nasmlib.h
disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h \
regdis.h regs.h sync.h tables.h tokens.h version.h
eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h \
@ -253,6 +253,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
saa.$(O): saa.c compiler.h nasmlib.h saa.h
stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h \
regs.h stdscan.h tokens.h version.h
strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h \
version.h
sync.$(O): sync.c compiler.h nasmlib.h sync.h
tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h \
nasmlib.h regs.h tokens.h version.h

View file

@ -120,7 +120,7 @@ $(OBJDIR)/version.inc: $(PROOT)/version $(PROOT)/version.pl $(OBJDIR)
#-- Everything below is generated by mkdep.pl - do not edit --#
assemble.o: assemble.c assemble.h compiler.h config.h insns.h insnsi.h \
nasm.h nasmlib.h regs.h tables.h tokens.h version.h
crc64.o: crc64.c compiler.h config.h
crc64.o: crc64.c compiler.h config.h nasmlib.h
disasm.o: disasm.c compiler.h config.h disasm.h insns.h insnsi.h nasm.h \
nasmlib.h regdis.h regs.h sync.h tables.h tokens.h version.h
eval.o: eval.c compiler.h config.h eval.h float.h insnsi.h labels.h nasm.h \
@ -193,6 +193,8 @@ regvals.o: regvals.c compiler.h config.h insnsi.h tables.h
saa.o: saa.c compiler.h config.h nasmlib.h saa.h
stdscan.o: stdscan.c compiler.h config.h insns.h insnsi.h nasm.h nasmlib.h \
quote.h regs.h stdscan.h tokens.h version.h
strfunc.o: strfunc.c compiler.h config.h insnsi.h nasm.h nasmlib.h regs.h \
version.h
sync.o: sync.c compiler.h config.h nasmlib.h sync.h
tokhash.o: tokhash.c compiler.h config.h hashtbl.h insns.h insnsi.h nasm.h \
nasmlib.h regs.h tokens.h version.h

View file

@ -209,7 +209,7 @@ everything: all doc rdf
#-- Everything below is generated by mkdep.pl - do not edit --#
assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h &
nasmlib.h regs.h tables.h tokens.h version.h
crc64.$(O): crc64.c compiler.h
crc64.$(O): crc64.c compiler.h nasmlib.h
disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h &
regdis.h regs.h sync.h tables.h tokens.h version.h
eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h &
@ -282,6 +282,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
saa.$(O): saa.c compiler.h nasmlib.h saa.h
stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h &
regs.h stdscan.h tokens.h version.h
strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h &
version.h
sync.$(O): sync.c compiler.h nasmlib.h sync.h
tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h &
nasmlib.h regs.h tokens.h version.h

View file

@ -219,7 +219,7 @@ everything: all doc rdf
#-- Everything below is generated by mkdep.pl - do not edit --#
assemble.$(O): assemble.c assemble.h compiler.h insns.h insnsi.h nasm.h \
nasmlib.h regs.h tables.h tokens.h version.h
crc64.$(O): crc64.c compiler.h
crc64.$(O): crc64.c compiler.h nasmlib.h
disasm.$(O): disasm.c compiler.h disasm.h insns.h insnsi.h nasm.h nasmlib.h \
regdis.h regs.h sync.h tables.h tokens.h version.h
eval.$(O): eval.c compiler.h eval.h float.h insnsi.h labels.h nasm.h \
@ -292,6 +292,8 @@ regvals.$(O): regvals.c compiler.h insnsi.h tables.h
saa.$(O): saa.c compiler.h nasmlib.h saa.h
stdscan.$(O): stdscan.c compiler.h insns.h insnsi.h nasm.h nasmlib.h quote.h \
regs.h stdscan.h tokens.h version.h
strfunc.$(O): strfunc.c compiler.h insnsi.h nasm.h nasmlib.h regs.h \
version.h
sync.$(O): sync.c compiler.h nasmlib.h sync.h
tokhash.$(O): tokhash.c compiler.h hashtbl.h insns.h insnsi.h nasm.h \
nasmlib.h regs.h tokens.h version.h

View file

@ -335,7 +335,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
out(offset, segment, &e->offset,
OUT_ADDRESS, wsize, e->segment, e->wrt);
offset += wsize;
} else if (e->type == EOT_DB_STRING) {
} else if (e->type == EOT_DB_STRING ||
e->type == EOT_DB_STRING_FREE) {
int align;
out(offset, segment, e->stringval,
@ -348,6 +349,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
OUT_RAWDATA, align, NO_SEG, NO_SEG);
}
offset += e->stringlen + align;
if (e->type == EOT_DB_STRING_FREE)
nasm_free(e->stringval);
}
}
if (t > 0 && t == instruction->times - 1) {
@ -365,15 +368,8 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
}
if (instruction->opcode == I_INCBIN) {
static char fname[FILENAME_MAX];
const char *fname = instruction->eops->stringval;
FILE *fp;
int32_t len;
len = FILENAME_MAX - 1;
if (len > instruction->eops->stringlen)
len = instruction->eops->stringlen;
strncpy(fname, instruction->eops->stringval, len);
fname[len] = '\0';
fp = fopen(fname, "rb");
if (!fp) {
@ -383,17 +379,18 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
error(ERR_NONFATAL, "`incbin': unable to seek on file `%s'",
fname);
} else {
static char buf[2048];
int32_t t = instruction->times;
int32_t base = 0;
static char buf[4096];
size_t t = instruction->times;
size_t base = 0;
size_t len;
len = ftell(fp);
if (instruction->eops->next) {
base = instruction->eops->next->offset;
len -= base;
if (instruction->eops->next->next &&
len > instruction->eops->next->next->offset)
len = instruction->eops->next->next->offset;
len > (size_t)instruction->eops->next->next->offset)
len = (size_t)instruction->eops->next->next->offset;
}
/*
* Dummy call to list->output to give the offset to the
@ -402,7 +399,7 @@ int64_t assemble(int32_t segment, int64_t offset, int bits, uint32_t cp,
list->output(offset, NULL, OUT_RAWDATA, 0);
list->uplevel(LIST_INCBIN);
while (t--) {
int32_t l;
size_t l;
fseek(fp, base, SEEK_SET);
l = len;
@ -660,7 +657,8 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
osize = 0;
if (e->type == EOT_DB_NUMBER)
osize = 1;
else if (e->type == EOT_DB_STRING)
else if (e->type == EOT_DB_STRING ||
e->type == EOT_DB_STRING_FREE)
osize = e->stringlen;
align = (-osize) % wsize;
@ -672,16 +670,10 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
}
if (instruction->opcode == I_INCBIN) {
char fname[FILENAME_MAX];
const char *fname = instruction->eops->stringval;
FILE *fp;
int32_t len;
size_t len;
len = FILENAME_MAX - 1;
if (len > instruction->eops->stringlen)
len = instruction->eops->stringlen;
strncpy(fname, instruction->eops->stringval, len);
fname[len] = '\0';
fp = fopen(fname, "rb");
if (!fp)
error(ERR_NONFATAL, "`incbin': unable to open file `%s'",
@ -695,8 +687,8 @@ int64_t insn_size(int32_t segment, int64_t offset, int bits, uint32_t cp,
if (instruction->eops->next) {
len -= instruction->eops->next->offset;
if (instruction->eops->next->next &&
len > instruction->eops->next->next->offset) {
len = instruction->eops->next->next->offset;
len > (size_t)instruction->eops->next->next->offset) {
len = (size_t)instruction->eops->next->next->offset;
}
}
return instruction->times * len;

26
nasm.h
View file

@ -182,6 +182,7 @@ enum token_type { /* token types, other than chars */
TOKEN_DBL_AND, TOKEN_DBL_OR, TOKEN_DBL_XOR, /* &&, || and ^^ */
TOKEN_SEG, TOKEN_WRT, /* SEG and WRT */
TOKEN_FLOATIZE, /* __floatX__ */
TOKEN_STRFUNC, /* __utf16__, __utf32__ */
};
enum floatize {
@ -195,6 +196,14 @@ enum floatize {
FLOAT_128H,
};
/* Must match the list in string_transform(), in strfunc.c */
enum strfunc {
STRFUNC_UTF16,
STRFUNC_UTF32,
};
size_t string_transform(char *, size_t, char **, enum strfunc);
/*
* The expression evaluator must be passed a scanner function; a
* standard scanner is provided as part of nasmlib.c. The
@ -605,11 +614,14 @@ enum prefixes { /* instruction prefixes */
PREFIX_ENUM_LIMIT
};
enum { /* extended operand types */
EOT_NOTHING, EOT_DB_STRING, EOT_DB_NUMBER
enum extop_type { /* extended operand types */
EOT_NOTHING,
EOT_DB_STRING, /* Byte string */
EOT_DB_STRING_FREE, /* Byte string which should be nasm_free'd*/
EOT_DB_NUMBER, /* Integer */
};
enum { /* special EA flags */
enum ea_flags { /* special EA flags */
EAF_BYTEOFFS = 1, /* force offset part to byte size */
EAF_WORDOFFS = 2, /* force offset part to [d]word size */
EAF_TIMESTWO = 4, /* really do EAX*2 not EAX+EAX */
@ -643,12 +655,12 @@ typedef struct operand { /* operand to an instruction */
typedef struct extop { /* extended operand */
struct extop *next; /* linked list */
int32_t type; /* defined above */
char *stringval; /* if it's a string, then here it is */
int stringlen; /* ... and here's how long it is */
int32_t segment; /* if it's a number/address, then... */
char *stringval; /* if it's a string, then here it is */
size_t stringlen; /* ... and here's how long it is */
int64_t offset; /* ... it's given here ... */
int32_t segment; /* if it's a number/address, then... */
int32_t wrt; /* ... and here */
enum extop_type type; /* defined above */
} extop;
/* Prefix positions: each type of prefix goes in a specific slot.

174
parser.c
View file

@ -334,6 +334,7 @@ restart_parse:
result->opcode == I_DY || result->opcode == I_INCBIN) {
extop *eop, **tail = &result->eops, **fixptr;
int oper_num = 0;
int32_t sign;
result->eops_float = false;
@ -355,85 +356,114 @@ restart_parse:
eop->next = NULL;
eop->type = EOT_NOTHING;
oper_num++;
sign = +1;
/* is_comma_next() here is to distinguish this from
a string used as part of an expression... */
if (i == TOKEN_STR && is_comma_next()) {
eop->type = EOT_DB_STRING;
eop->stringval = tokval.t_charptr;
eop->stringlen = tokval.t_inttwo;
i = stdscan(NULL, &tokval); /* eat the comma */
continue;
}
if ((i == TOKEN_FLOAT && is_comma_next())
|| i == '-' || i == '+') {
int32_t sign = +1;
if (i == '+' || i == '-') {
char *save = stdscan_bufptr;
int token = i;
sign = (i == '-') ? -1 : 1;
i = stdscan(NULL, &tokval);
if (i != TOKEN_FLOAT || !is_comma_next()) {
stdscan_bufptr = save;
i = tokval.t_type = token;
}
}
if (i == TOKEN_FLOAT) {
eop->type = EOT_DB_STRING;
result->eops_float = true;
switch (result->opcode) {
case I_DB:
eop->stringlen = 1;
break;
case I_DW:
eop->stringlen = 2;
break;
case I_DD:
eop->stringlen = 4;
break;
case I_DQ:
eop->stringlen = 8;
break;
case I_DT:
eop->stringlen = 10;
break;
case I_DO:
eop->stringlen = 16;
break;
case I_DY:
error(ERR_NONFATAL, "floating-point constant"
" encountered in DY instruction");
eop->stringlen = 0;
break;
default:
error(ERR_NONFATAL, "floating-point constant"
" encountered in unknown instruction");
/*
* fix suggested by Pedro Gimeno... original line
* was:
* eop->type = EOT_NOTHING;
*/
eop->stringlen = 0;
break;
}
eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
tail = &eop->next;
*fixptr = eop;
eop->stringval = (char *)eop + sizeof(extop);
if (!eop->stringlen ||
!float_const(tokval.t_charptr, sign,
(uint8_t *)eop->stringval,
eop->stringlen, error))
eop->type = EOT_NOTHING;
i = stdscan(NULL, &tokval); /* eat the comma */
continue;
}
}
/* anything else */
{
} else if (i == TOKEN_STRFUNC) {
bool parens = false;
const char *funcname = tokval.t_charptr;
enum strfunc func = tokval.t_integer;
i = stdscan(NULL, &tokval);
if (i == '(') {
parens = true;
i = stdscan(NULL, &tokval);
}
if (i != TOKEN_STR) {
error(ERR_NONFATAL,
"%s must be followed by a string constant",
funcname);
eop->type = EOT_NOTHING;
} else {
eop->type = EOT_DB_STRING_FREE;
eop->stringlen =
string_transform(tokval.t_charptr, tokval.t_inttwo,
&eop->stringval, func);
if (eop->stringlen == (size_t)-1) {
error(ERR_NONFATAL, "invalid string for transform");
eop->type = EOT_NOTHING;
}
}
if (parens && i && i != ')') {
i = stdscan(NULL, &tokval);
if (i != ')') {
error(ERR_NONFATAL, "unterminated %s function",
funcname);
}
}
if (i && i != ',')
i = stdscan(NULL, &tokval);
} else if (i == '-' || i == '+') {
char *save = stdscan_bufptr;
int token = i;
sign = (i == '-') ? -1 : 1;
i = stdscan(NULL, &tokval);
if (i != TOKEN_FLOAT) {
stdscan_bufptr = save;
i = tokval.t_type = token;
goto is_expression;
} else {
goto is_float;
}
} else if (i == TOKEN_FLOAT) {
is_float:
eop->type = EOT_DB_STRING;
result->eops_float = true;
switch (result->opcode) {
case I_DB:
eop->stringlen = 1;
break;
case I_DW:
eop->stringlen = 2;
break;
case I_DD:
eop->stringlen = 4;
break;
case I_DQ:
eop->stringlen = 8;
break;
case I_DT:
eop->stringlen = 10;
break;
case I_DO:
eop->stringlen = 16;
break;
case I_DY:
error(ERR_NONFATAL, "floating-point constant"
" encountered in DY instruction");
eop->stringlen = 0;
break;
default:
error(ERR_NONFATAL, "floating-point constant"
" encountered in unknown instruction");
/*
* fix suggested by Pedro Gimeno... original line
* was:
* eop->type = EOT_NOTHING;
*/
eop->stringlen = 0;
break;
}
eop = nasm_realloc(eop, sizeof(extop) + eop->stringlen);
tail = &eop->next;
*fixptr = eop;
eop->stringval = (char *)eop + sizeof(extop);
if (!eop->stringlen ||
!float_const(tokval.t_charptr, sign,
(uint8_t *)eop->stringval,
eop->stringlen, error))
eop->type = EOT_NOTHING;
i = stdscan(NULL, &tokval); /* eat the comma */
} else {
/* anything else, assume it is an expression */
expr *value;
is_expression:
value = evaluate(stdscan, NULL, &tokval, NULL,
critical, error, NULL);
i = tokval.t_type;

167
strfunc.c Normal file
View file

@ -0,0 +1,167 @@
/*
* strfunc.c
*
* String transformation functions
*/
#include "nasmlib.h"
#include "nasm.h"
/*
 * Convert a string in UTF-8 format to UTF-16LE
 *
 * str, len - input bytes and their count (no NUL terminator is required).
 * op       - output buffer, or NULL to only measure the output.
 *
 * Returns the size of the converted string in bytes (two per UTF-16
 * code unit), or (size_t)-1 if the input is not acceptable UTF-8:
 * a stray or missing continuation byte, an overlong encoding, an
 * encoded surrogate (0xd800-0xdfff), a value above 0x10ffff, or a
 * sequence truncated by the end of the input.
 *
 * When op is non-NULL the caller must have sized it from a previous
 * measuring call.  WRITESHORT comes from nasmlib.h and is presumably
 * what advances op after each store -- confirm there.
 */
static size_t utf8_to_16le(uint8_t *str, size_t len, char *op)
{
#define EMIT(x) do { if (op) { WRITESHORT(op,x); } outlen++; } while(0)

    size_t outlen = 0;          /* UTF-16 code units produced so far */
    int expect = 0;             /* continuation bytes still outstanding */
    uint8_t c;
    uint32_t v = 0, vmin = 0;   /* value being built; smallest legal value */

    while (len--) {
        c = *str++;

        if (expect) {
            /* Inside a multi-byte sequence: only 10xxxxxx is legal here */
            if ((c & 0xc0) != 0x80)
                return (size_t)-1;

            v = (v << 6) | (c & 0x3f);
            if (!--expect) {
                /* Sequence complete: reject overlong, out-of-range, surrogate */
                if (v < vmin || v > 0x10ffff ||
                    (v >= 0xd800 && v <= 0xdfff))
                    return (size_t)-1;

                if (v > 0xffff) {
                    /* Outside the BMP: emit a high/low surrogate pair */
                    v -= 0x10000;
                    EMIT(0xd800 | (v >> 10));
                    EMIT(0xdc00 | (v & 0x3ff));
                } else {
                    EMIT(v);
                }
            }
            continue;
        }

        if (c < 0x80) {
            EMIT(c);
        } else if (c < 0xc0 || c >= 0xfe) {
            /*
             * Invalid UTF-8: 0x80-0xbf are continuation bytes and can
             * never start a sequence; 0xfe/0xff never occur at all.
             */
            return (size_t)-1;
        } else if (c < 0xe0) {
            v = c & 0x1f;
            expect = 1;
            vmin = 0x80;
        } else if (c < 0xf0) {
            v = c & 0x0f;
            expect = 2;
            vmin = 0x800;
        } else if (c < 0xf8) {
            v = c & 0x07;
            expect = 3;
            vmin = 0x10000;
        } else if (c < 0xfc) {
            /* 5- and 6-byte forms are parsed, then fail the range check */
            v = c & 0x03;
            expect = 4;
            vmin = 0x200000;
        } else {
            v = c & 0x01;
            expect = 5;
            vmin = 0x4000000;
        }
    }

    /* A sequence still open at end of input is an error */
    return expect ? (size_t)-1 : outlen << 1;

#undef EMIT
}
/*
 * Convert a string in UTF-8 format to UTF-32LE
 *
 * str, len - input bytes and their count (no NUL terminator is required).
 * op       - output buffer, or NULL to only measure the output.
 *
 * Returns the size of the converted string in bytes (four per code
 * point), or (size_t)-1 if the input is not acceptable UTF-8: a stray
 * or missing continuation byte, an overlong encoding, an encoded
 * surrogate (0xd800-0xdfff), or a sequence truncated by the end of
 * the input.
 *
 * NOTE(review): unlike utf8_to_16le(), values above 0x10ffff (the
 * 5- and 6-byte forms) are not rejected here.  That looks deliberate
 * since 32 bits can hold them, but it is outside the Unicode range --
 * confirm before tightening.
 *
 * When op is non-NULL the caller must have sized it from a previous
 * measuring call.  WRITELONG comes from nasmlib.h and is presumably
 * what advances op after each store -- confirm there.
 */
static size_t utf8_to_32le(uint8_t *str, size_t len, char *op)
{
#define EMIT(x) do { if (op) { WRITELONG(op,x); } outlen++; } while(0)

    size_t outlen = 0;          /* code points produced so far */
    int expect = 0;             /* continuation bytes still outstanding */
    uint8_t c;
    uint32_t v = 0, vmin = 0;   /* value being built; smallest legal value */

    while (len--) {
        c = *str++;

        if (expect) {
            /* Inside a multi-byte sequence: only 10xxxxxx is legal here */
            if ((c & 0xc0) != 0x80)
                return (size_t)-1;

            v = (v << 6) | (c & 0x3f);
            if (!--expect) {
                /* Sequence complete: reject overlong forms and surrogates */
                if (v < vmin || (v >= 0xd800 && v <= 0xdfff))
                    return (size_t)-1;
                EMIT(v);
            }
            continue;
        }

        if (c < 0x80) {
            EMIT(c);
        } else if (c < 0xc0 || c >= 0xfe) {
            /*
             * Invalid UTF-8: 0x80-0xbf are continuation bytes and can
             * never start a sequence; 0xfe/0xff never occur at all.
             */
            return (size_t)-1;
        } else if (c < 0xe0) {
            v = c & 0x1f;
            expect = 1;
            vmin = 0x80;
        } else if (c < 0xf0) {
            v = c & 0x0f;
            expect = 2;
            vmin = 0x800;
        } else if (c < 0xf8) {
            v = c & 0x07;
            expect = 3;
            vmin = 0x10000;
        } else if (c < 0xfc) {
            v = c & 0x03;
            expect = 4;
            vmin = 0x200000;
        } else {
            v = c & 0x01;
            expect = 5;
            vmin = 0x4000000;
        }
    }

    /* A sequence still open at end of input is an error */
    return expect ? (size_t)-1 : outlen << 2;

#undef EMIT
}
/*
 * Common signature of the converters: (input, input length, output
 * buffer or NULL) -> output size in bytes, or (size_t)-1 on error.
 */
typedef size_t (*transform_func)(uint8_t *, size_t, char *);

/*
 * Apply a specific string transform and return it in a nasm_malloc'd
 * buffer, returning the length.  On error, returns (size_t)-1 and no
 * buffer is allocated.
 *
 * str, len - input string and its length in bytes.
 * out      - receives the newly allocated result on success; it is
 *            not written on failure.  The caller owns the buffer.
 * func     - which transform to apply; a value outside the table
 *            below is reported as an error.
 *
 * The transform is run twice: once with a NULL output pointer to
 * validate the input and measure the result, then again to fill the
 * buffer.
 */
size_t string_transform(char *str, size_t len, char **out, enum strfunc func)
{
    /* This should match enum strfunc in nasm.h */
    static const transform_func str_transforms[] = {
        utf8_to_16le,
        utf8_to_32le,
    };
    transform_func transform;
    size_t outlen;
    uint8_t *s = (uint8_t *)str;

    /* Never index past the table, even if the enum and table drift apart */
    if ((size_t)func >= sizeof str_transforms / sizeof str_transforms[0])
        return (size_t)-1;
    transform = str_transforms[func];

    outlen = transform(s, len, NULL);
    if (outlen == (size_t)-1)
        return (size_t)-1;

    /*
     * An empty input yields outlen == 0.  Ask for at least one byte:
     * a zero-size request may legitimately yield NULL from the C
     * allocator, which nasm_malloc presumably treats as out-of-memory
     * (its definition is not visible here).
     */
    *out = nasm_malloc(outlen ? outlen : 1);
    return transform(s, len, *out);
}

View file

@ -53,6 +53,10 @@ __float80e__
__float128l__
__float128h__
% TOKEN_STRFUNC, 0, STRFUNC_{__*__}
__utf16__
__utf32__
% TOKEN_*, 0, 0
seg
wrt