jwasm/tokenize.c
2014-06-18 19:16:56 +04:00

1088 lines
38 KiB
C

/****************************************************************************
*
* Open Watcom Project
*
* Portions Copyright (c) 1983-2002 Sybase, Inc. All Rights Reserved.
*
* ========================================================================
*
* This file contains Original Code and/or Modifications of Original
* Code as defined in and that are subject to the Sybase Open Watcom
* Public License version 1.0 (the 'License'). You may not use this file
* except in compliance with the License. BY USING THIS FILE YOU AGREE TO
* ALL TERMS AND CONDITIONS OF THE LICENSE. A copy of the License is
* provided with the Original Code and Modifications, and is also
* available at www.sybase.com/developer/opensource.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND SYBASE AND ALL CONTRIBUTORS HEREBY DISCLAIM
* ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR
* NON-INFRINGEMENT. Please see the License for the specific language
* governing rights and limitations under the License.
*
* ========================================================================
*
* Description: tokenizer.
*
****************************************************************************/
#include <ctype.h>
#include "globals.h"
#include "memalloc.h"
#include "parser.h"
#include "condasm.h"
#include "reswords.h"
#include "input.h"
#include "segment.h"
#include "listing.h"
#include "tokenize.h"
#include "fastpass.h"
#include "myassert.h"
/* CONCATID: if nonzero, get_id() checks for a backslash directly behind
 * an identifier and may continue scanning the name on the concatenated line.
 */
#define CONCATID 0 /* 0=most compatible (see backsl.asm) */
/* MASMNUMBER: selects in get_number() whether a number that is directly
 * followed by identifier characters is still accepted (1) or is turned
 * into T_BAD_NUM (0).
 */
#define MASMNUMBER 1 /* 1=Masm-compatible number scanning */
/* TOKSTRALIGN: if 1, Tokenize() re-aligns the string buffer pointer with
 * GetAlignedPointer() after each token.
 */
#ifdef __I86__
#define TOKSTRALIGN 0 /* 0=don't align token strings */
#else
#define TOKSTRALIGN 1 /* 1=align token strings to sizeof(uint_32) */
#endif
#ifndef DOTNAMEX /* v2.08: added */
/* set DOTNAMEX to 1 if support for Intel C++ generated assembly code
* is to be enabled.
*/
#define DOTNAMEX 0
#endif
extern struct ReservedWord ResWordTable[];
#ifdef DEBUG_OUT
/* debug statistics: Tokenize() increments cnttok0 when it is called with
 * start == 0 and cnttok1 otherwise.
 */
int_32 cnttok0;
int_32 cnttok1;
extern struct asm_tok *end_tokenarray;
extern char *end_stringbuf;
#endif
extern char *token_stringbuf; /* start token string buffer */
/* Tokenize() copies the trailing ';' comment of a line into this buffer */
extern char *commentbuffer;
/* v2.08: moved to struct line_status */
//static uint_8 g_flags; /* directive flags for current line */
#if !defined(__GNUC__) && !defined(__POCC__)
/* ASCII-only replacement for the library tolower().
 * The argument and the result expressions are fully parenthesized so that
 * the macro expands correctly for any argument expression. Note that the
 * argument is still evaluated more than once - don't pass expressions
 * with side effects.
 */
#define tolower(c) ( ( (c) >= 'A' && (c) <= 'Z' ) ? ( (c) | 0x20 ) : (c) )
#endif
/* strings for token 0x28 - 0x2F.
 * Each entry is a 2-byte, NUL-terminated string; the tokenizer uses
 * (char *)&stokstr1[ch - '('] as the token's string_ptr.
 * The entries are stored as char pairs (not as 16-bit integers), so the
 * string is correct regardless of the host's byte order.
 */
static const char stokstr1[][2] = {
    "(", ")", "*", "+", ",", "-", ".", "/" };
/* strings for token 0x5B - 0x5D.
 * Index 1 (0x5C, the backslash) is not a token of its own and stays empty.
 */
static const char stokstr2[][2] = {
    "[", "", "]" };
/* test line concatenation if last token is a comma.
* dont concat EQU, macro invocations or
* - ECHO
* - FORC/IRPC (v2.0)
* - INCLUDE (v2.8)
* lines!
* v2.05: don't concat if line's an instruction.
*/
static bool IsMultiLine( struct asm_tok tokenarray[] )
/****************************************************/
{
struct asym *sym;
int i;
if ( tokenarray[1].token == T_DIRECTIVE && tokenarray[1].tokval == T_EQU )
return( FALSE );
i = ( tokenarray[1].token == T_COLON ? 2 : 0 );
/* don't concat macros */
if ( tokenarray[i].token == T_ID ) {
sym = SymSearch( tokenarray[i].string_ptr );
if ( sym && ( sym->state == SYM_MACRO )
#if VARARGML
&& sym->mac_multiline == FALSE /* v2.11: added */
#endif
)
return( FALSE );
} else if ( tokenarray[i].token == T_INSTRUCTION ||
( tokenarray[i].token == T_DIRECTIVE &&
( tokenarray[i].tokval == T_ECHO ||
tokenarray[i].tokval == T_INCLUDE ||
tokenarray[i].tokval == T_FORC ||
tokenarray[i].tokval == T_IRPC ) ) ) {
return( FALSE );
}
return( TRUE );
}
/* Scan a floating-point literal starting at p->input.
 * The scanned text is copied (NUL-terminated) to p->output, both
 * p->input and p->output are advanced and the token becomes T_FLOAT
 * with floattype NULLC. Always returns NOT_ERROR.
 */
static ret_code get_float( struct asm_tok *buf, struct line_status *p )
/*********************************************************************/
{
    /* valid floats look like: (int)[.(int)][e(int)]
     * Masm also allows hex format, terminated by 'r' (3F800000r)
     */
    char got_decimal = FALSE;
    char got_e = FALSE;
    char *ptr = p->input;

    for( ; *ptr != NULLC; ptr++ ) {
        /* the <ctype.h> functions are only defined for values that are
         * representable as unsigned char (or EOF); a plain char may be
         * negative, so convert before classifying.
         */
        unsigned char c = (unsigned char)*ptr;
        if( isdigit( c ) ) {
            ;
        } else if ( c == '.' && got_decimal == FALSE ) {
            got_decimal = TRUE;
        } else if ( tolower( c ) == 'e' && got_e == FALSE ) {
            got_e = TRUE;
            /* accept e+2 / e-4 /etc. */
            if ( *(ptr+1) == '+' || *(ptr+1) == '-' )
                ptr++;
            /* it's accepted if there's no digit behind 'e' */
            //if ( !isdigit( *(ptr+1) ) )
            //    break;
        } else
            break;
    }

    buf->token = T_FLOAT;
    buf->floattype = NULLC;
    /* copy the scanned text and terminate it */
    memcpy( p->output, p->input, ptr - p->input );
    p->output += ( ptr - p->input );
    *p->output++ = NULLC;
    p->input = ptr;
    /* the binary value isn't used currently */
    //*((float *)(&buf->value)) = atof( buf->string_ptr );
    return( NOT_ERROR );
}
/* Handle a possible line continuation.
 * src: position in the current line that is to be checked (the callers
 *      pass the position of a backslash); the char at src itself is skipped.
 * cnt: number of chars the caller has already scanned for the current
 *      item; if it is 0, a space is stored at src before the new text.
 * out: buffer into which the next source line is read.
 * ls:  line status; ls->start is used for the line length check.
 *
 * If only white space or a comment follows src, the next line is read
 * with GetTextLine(), its leading white space is skipped and the rest is
 * copied to src. If the result would exceed MAX_LINE_LEN, LINE_TOO_LONG is
 * emitted and the appended text is truncated.
 * Returns NOT_ERROR if a line was appended, else EMPTY.
 */
static ret_code ConcatLine( char *src, int cnt, char *out, struct line_status *ls )
/*********************************************************************************/
{
    char *p = src+1;
    int max;

    /* plain char may be negative; <ctype.h> needs an unsigned char value */
    while ( isspace( (unsigned char)*p ) ) p++;
    if ( *p == NULLC || *p == ';' ) {
        //char *buffer = GetAlignedPointer( out, strlen( out ) );
        char *buffer = out;
        if( GetTextLine( buffer ) ) {
            p = buffer;
            /* skip leading spaces */
            while ( isspace( (unsigned char)*p ) ) p++;
            max = strlen( p );
            if ( cnt == 0 )
                *src++ = ' ';
            if ( ( src - ls->start ) + max >= MAX_LINE_LEN ) {
                EmitError( LINE_TOO_LONG );
                max = MAX_LINE_LEN - ( src - ls->start + 1 );
                *(p+max) = NULLC;
            }
            /* +1: copy the terminating NUL as well */
            memcpy( src, p, max+1 );
            return( NOT_ERROR );
        }
    }
    return( EMPTY );
}
/* Scan a string or text literal starting at p->input.
 * Three forms are handled, selected by the first character:
 * - "..." or '...': quoted string; the delimiters are stored, a doubled
 *   quote inside the string is stored as a single quote. A missing closing
 *   quote turns the item into an undelimited string.
 * - <...> or {...}: literal; the outer delimiters are not stored, nested
 *   delimiters are. '{' is only a delimiter if TOK_NOCURLBRACES isn't set.
 * - anything else: undelimited string, copied until white space, a comma,
 *   ')', '%' or end of line (and ';' if p->flags is TOK_DEFAULT).
 * On success buf becomes a T_STRING token with stringlen set, and
 * p->input / p->output are advanced.
 * Returns NOT_ERROR, ERROR (literal or line too long) or EMPTY (a line was
 * concatenated before anything of an undelimited string had been scanned).
 */
static ret_code get_string( struct asm_tok *buf, struct line_status *p )
/**********************************************************************/
{
    char symbol_o;
    char symbol_c;
    char c;
    char *src = p->input;
    char *dst = p->output;
    int count = 0;
    int level;

    symbol_o = *src;
    switch( symbol_o ) {
    case '"':
    case '\'':
        buf->string_delim = symbol_o;
        *dst++ = symbol_o;
        src++;
        for ( ; count < MAX_STRING_LEN; src++, count++ ) {
            c = *src;
            if( c == symbol_o ) { /* another quote? */
                *dst++ = c; /* store it */
                src++;
                if( *src != c )
                    break; /* exit loop */
                /* a pair of quotes inside the string is
                 * handled as a single quote */
            } else if( c == NULLC ) {
                /* missing terminating quote, change to undelimited string */
                buf->string_delim = NULLC;
                count++; /* count the first quote */
                break;
            } else {
                *dst++ = c;
            }
        }
        break; /* end of string marker is the same */
    case '{':
        if ( p->flags & TOK_NOCURLBRACES )
            goto undelimited_string;
        /* fall through */
    case '<':
        buf->string_delim = symbol_o;
        symbol_c = ( symbol_o == '<' ? '>' : '}' );
        src++;
        for( level = 0; count < MAX_STRING_LEN; ) {
            c = *src;
            if( c == symbol_o ) { /* < or { ? */
                level++;
                *dst++ = c; src++;
                count++;
            } else if( c == symbol_c ) { /* > or }? */
                if( level ) {
                    level--;
                    *dst++ = c; src++;
                    count++;
                } else {
                    /* store the string delimiter unless it is <> */
                    /* v2.08: don't store delimiters for {}-literals */
                    //if (symbol_o != '<')
                    //    *dst++ = c;
                    src++;
                    break; /* exit loop */
                }
                /*
                 a " or ' inside a <>/{} string? Since it's not a must that
                 [double-]quotes are paired in a literal it must be done
                 directive-dependant!
                 see: IFIDN <">,<">
                 */
            } else if( ( c == '"' || c == '\'' ) && ( p->flags2 & DF_STRPARM ) == 0 ) {
                char delim = c;
                char *tdst;
                char *tsrc;
                int tcount;
                *dst++ = c; src++;
                count++;
                /* remember the state behind the quote so that it can be
                 * restored if no matching quote is found */
                tdst = dst;
                tsrc = src;
                tcount = count;
                while (*src != delim && *src != NULLC && count < MAX_STRING_LEN-1 ) {
                    if ( symbol_o == '<' && *src == '!' && *(src+1) != NULLC )
                        src++;
                    *dst++ = *src++;
                    count++;
                }
                if ( *src == delim ) {
                    *dst++ = *src++;
                    count++;
                    continue;
                } else {
                    /* restore values */
                    src = tsrc;
                    dst = tdst;
                    count = tcount;
                }
            } else if( c == '!' && symbol_o == '<' && *(src+1) ) {
                /* handle literal-character operator '!'.
                 * it makes the next char to enter the literal uninterpreted.
                 */
                /* v2.09: don't store the '!' */
                src++;
                *dst++ = *src++;
                count++;
            } else if( c == '\\' && ConcatLine( src, count, dst, p ) != EMPTY ) {
                p->flags3 |= TF3_ISCONCAT;
            } else if( c == NULLC || ( c == ';' && symbol_o == '{' )) {
                if ( p->flags == TOK_DEFAULT && (( p->flags2 & DF_NOCONCAT ) == 0 ) ) { /* <{ */
                    /* if last nonspace character was a comma
                     * get next line and continue string scan.
                     * Only the chars stored for this literal are examined;
                     * the scan never steps in front of p->output.
                     */
                    char *tmp = dst;
                    while ( tmp > p->output && isspace( (unsigned char)*(tmp-1) ) ) tmp--;
                    if ( tmp > p->output && *(tmp-1) == ',' ) {
                        DebugMsg1(("Tokenize.get_string: comma concatenation: %s\n", src ));
                        /* the partial literal isn't NUL-terminated yet, so
                         * its length is taken from the write pointer */
                        tmp = GetAlignedPointer( p->output, ( dst - p->output ) );
                        if( GetTextLine( tmp ) ) {
                            /* skip leading spaces */
                            while ( isspace( (unsigned char)*tmp ) ) tmp++;
                            /* this size check isn't fool-proved yet */
                            if ( strlen( tmp ) + count >= MAX_LINE_LEN ) {
                                EmitError( LINE_TOO_LONG );
                                return( ERROR );
                            }
                            strcpy( src, tmp );
                            continue;
                        }
                    }
                }
                /* no closing delimiter: rescan the item as undelimited string */
                src = p->input;
                dst = p->output;
                *dst++ = *src++;
                count = 1;
                goto undelimited_string;
            } else {
                *dst++ = c; src++;
                count++;
            }
        }
        break;
    default:
    undelimited_string:
        buf->string_delim = NULLC;
        /* this is an undelimited string,
         * so just copy it until we hit something that looks like the end.
         * this format is used by the INCLUDE directive, but may also
         * occur inside the string macros!
         */
        /* v2.05: also stop if a ')' is found - see literal2.asm regression test */
        /* v2.08: stop also at % */
        for( ; count < MAX_STRING_LEN &&
            *src && !isspace( (unsigned char)*src ) && *src != ',' && *src != ')' && *src != '%'; ) {
            if ( *src == ';' && p->flags == TOK_DEFAULT )
                break;
            /* v2.11: handle '\' also for expanded lines */
            if ( *src == '\\' && ( p->flags == TOK_DEFAULT || ( p->flags & TOK_LINE ) ) ) {
                if ( ConcatLine( src, count, dst, p ) != EMPTY ) {
                    DebugMsg1(("Tokenize.get_string: backslash concatenation: >%s<\n", src ));
                    p->flags3 |= TF3_ISCONCAT;
                    if ( count )
                        continue;
                    return( EMPTY );
                }
            }
            /* v2.08: handle '!' operator */
            if ( *src == '!' && *(src+1) && count < MAX_STRING_LEN - 1 )
                *dst++ = *src++;
            *dst++ = *src++;
            count++;
        }
        break;
    }

    if ( count == MAX_STRING_LEN ) {
        EmitError( STRING_OR_TEXT_LITERAL_TOO_LONG );
        return( ERROR );
    }
    *dst++ = NULLC;
    buf->token = T_STRING;
    buf->stringlen = count;
    p->input = src;
    p->output = dst;
    return( NOT_ERROR );
}
static ret_code get_special_symbol( struct asm_tok *buf, struct line_status *p )
/******************************************************************************/
{
char symbol;
//int i;
symbol = *p->input;
switch( symbol ) {
case ':' : /* T_COLON binary operator (0x3A) */
p->input++;
if ( *p->input == ':' ) {
p->input++;
buf->token = T_DBL_COLON;
buf->string_ptr = "::";
} else {
buf->token = T_COLON;
buf->string_ptr = ":";
}
break;
case '%' : /* T_PERCENT (0x25) */
#if PERCENT_OUT
/* %OUT directive? */
if ( ( _memicmp( p->input+1, "OUT", 3 ) == 0 ) && !is_valid_id_char( *(p->input+4) ) ) {
buf->token = T_DIRECTIVE;
buf->tokval = T_ECHO;
buf->dirtype = DRT_ECHO;
memcpy( p->output, p->input, 4 );
p->input += 4;
p->output += 4;
*(p->output)++ = NULLC;
break;
}
#endif
p->input++;
if ( p->flags == TOK_DEFAULT && p->index == 0 ) {
p->flags3 |= TF3_EXPANSION;
return( EMPTY );
}
buf->token = T_PERCENT;
buf->string_ptr = "%";
break;
case '(' : /* 0x28: T_OP_BRACKET operator - needs a matching ')' */
/* v2.11: reset c-expression flag if a macro function call is detected */
if ( ( p->flags2 & DF_CEXPR ) && p->index && (buf-1)->token == T_ID ) {
struct asym *sym = SymSearch( (buf-1)->string_ptr );
if ( sym && ( sym->state == SYM_MACRO ) && sym->isfunc )
p->flags2 &= ~DF_CEXPR;
}
/* no break */
case ')' : /* 0x29: T_CL_BRACKET */
case '*' : /* 0x2A: binary operator */
case '+' : /* 0x2B: unary|binary operator */
case ',' : /* 0x2C: T_COMMA */
case '-' : /* 0x2D: unary|binary operator */
case '.' : /* 0x2E: T_DOT binary operator */
case '/' : /* 0x2F: binary operator */
/* all of these are themselves a token */
p->input++;
buf->token = symbol;
buf->specval = 0; /* initialize, in case the token needs extra data */
/* v2.06: use constants for the token string */
buf->string_ptr = (char *)&stokstr1[symbol - '('];
break;
case '[' : /* T_OP_SQ_BRACKET operator - needs a matching ']' (0x5B) */
case ']' : /* T_CL_SQ_BRACKET (0x5D) */
p->input++;
buf->token = symbol;
/* v2.06: use constants for the token string */
buf->string_ptr = (char *)&stokstr2[symbol - '['];
break;
case '=' : /* (0x3D) */
if ( *(p->input+1) != '=' ) {
buf->token = T_DIRECTIVE;
buf->tokval = T_EQU;
buf->dirtype = DRT_EQUALSGN; /* to make it differ from EQU directive */
buf->string_ptr = "=";
p->input++;
break;
}
/* fall through */
default:
/* detect C style operators.
* DF_CEXPR is set if .IF, .WHILE, .ELSEIF or .UNTIL
* has been detected in the current line.
* will catch: '!', '<', '>', '&', '==', '!=', '<=', '>=', '&&', '||'
* A single '|' will also be caught, although it isn't a valid
* operator - it will cause a 'operator expected' error msg later.
* the tokens are stored as one- or two-byte sized "strings".
*/
if ( ( p->flags2 & DF_CEXPR ) && strchr( "=!<>&|", symbol ) ) {
*(p->output)++ = symbol;
p->input++;
buf->stringlen = 1;
if ( symbol == '&' || symbol == '|' ) {
if ( *p->input == symbol ) {
*(p->output)++ = symbol;
p->input++;
buf->stringlen = 2;
}
} else if ( *p->input == '=' ) {
*(p->output)++ = '=';
p->input++;
buf->stringlen = 2;
}
buf->token = T_STRING;
buf->string_delim = NULLC;
*(p->output)++ = NULLC;
break;
}
/* v2.08: ampersand is a special token */
if ( symbol == '&' ) {
p->input++;
buf->token = '&';
buf->string_ptr = "&";
break;
}
/* anything we don't recognise we will consider a string,
* delimited by space characters, commas, newlines or nulls
*/
return( get_string( buf, p ) );
}
return( NOT_ERROR );
}
#if 0
/* NOTE: this function is excluded from compilation by the "#if 0" above.
 * It treats buf as an array of <size> bytes, processed from buf[0] upwards:
 * each byte is multiplied by <base>, <num> (the carry, initially the value
 * to add) is added, the low 8 bits are stored back and the rest is carried
 * to the next byte.
 */
static void array_mul_add( unsigned char *buf, unsigned base, unsigned num, unsigned size )
/*****************************************************************************************/
{
while( size-- > 0 ) {
num += *buf * base;
*(buf++) = num;
num >>= 8;
}
}
#endif
/* read in a number.
 * check the number suffix:
 * b or y: base 2
 * d or t: base 10
 * h: base 16
 * o or q: base 8
 *
 * The token becomes T_NUM (numbase and itemlen are set), T_FLOAT (suffix
 * 'r', or a '.' terminates the digits - then get_float() does the scan)
 * or T_BAD_NUM. The scanned text is copied NUL-terminated to p->output;
 * p->input and p->output are advanced. Always returns NOT_ERROR unless
 * get_float() is called, whose return value is passed on.
 */
static ret_code get_number( struct asm_tok *buf, struct line_status *p )
/**********************************************************************/
{
    char *ptr = p->input;
    char *dig_start;
    char *dig_end;
    unsigned base = 0;
    unsigned len;
    uint_32 digits_seen; /* bit n is set if digit n (0-15) has been seen */
    char last_char;
#define VALID_BINARY 0x0003
#define VALID_OCTAL 0x00ff
#define VALID_DECIMAL 0x03ff
#define OK_NUM( t ) ((digits_seen & ~VALID_##t) == 0)

    digits_seen = 0;
#if CHEXPREFIX
    if( *ptr == '0' && (tolower( (unsigned char)*(ptr+1) ) == 'x' ) ) {
        ptr += 2;
        base = 16;
    }
#endif
    dig_start = ptr;
    for( ;; ptr++ ) {
        if (*ptr >= '0' && *ptr <= '9')
            digits_seen |= 1 << (*ptr - '0');
        else {
            /* plain char may be negative; tolower() needs an unsigned char value */
            last_char = tolower( (unsigned char)*ptr );
            if ( last_char >= 'a' && last_char <= 'f' )
                digits_seen |= 1 << ( last_char + 10 - 'a' );
            else
                break;
        }
    }
    /* default; every path that sets a valid base assigns dig_end itself */
    dig_end = ptr;

    /* note that a float MUST contain a dot.
     * 1234e78 is NOT a valid float
     */
    if ( last_char == '.' )
        return( get_float( buf, p ) );
#if 0
    /* v2.08: if suffix isn't followed by a non-id char, don't use it! */
    if ( last_char && is_valid_id_char( *(ptr+1) ) ) {
        last_char = NULLC;
        while ( *(ptr-1) > '9' )
            ptr--;
        digits_seen &= 0x3FF;
    }
#endif
#if CHEXPREFIX
    if ( base != 0 ) {
        dig_end = ptr;
        if ( digits_seen == 0 )
            base = 0;
    } else
#endif
    switch( last_char ) {
    case 'r': /* a float with the "real number designator" */
        buf->token = T_FLOAT;
        buf->floattype = 'r';
        ptr++;
        goto number_done;
    case 'h':
        base = 16;
        dig_end = ptr;
        ptr++;
        break;
    //case 'b':
    case 'y':
        if( OK_NUM( BINARY ) ) {
            base = 2;
            dig_end = ptr;
            ptr++;
        }
        break;
    //case 'd':
    case 't':
        if( OK_NUM( DECIMAL ) ) {
            base = 10;
            dig_end = ptr;
            ptr++;
        }
        break;
    case 'q':
    case 'o':
        if( OK_NUM( OCTAL ) ) {
            base = 8;
            dig_end = ptr;
            ptr++;
        }
        break;
    default:
        /* 'b' and 'd' are hex digits as well, so they have been consumed
         * by the scan loop; check if the last char is such a suffix.
         */
        last_char = tolower( (unsigned char)*(ptr-1) );
        if ( ( last_char == 'b' || last_char == 'd' ) && digits_seen >= ( 1UL << ModuleInfo.radix ) ) {
            char *tmp = dig_start;
            char max = ( last_char == 'b' ? '1' : '9' );
            for ( dig_end = ptr-1; tmp < dig_end && *tmp <= max; tmp++ );
            if ( tmp == dig_end ) {
                base = ( last_char == 'b' ? 2 : 10 );
                break;
            }
        }
        dig_end = ptr;
#if COCTALS
        if( Options.allow_c_octals && *dig_start == '0' ) {
            if( OK_NUM( OCTAL ) ) {
                base = 8;
                break;
            }
        }
#endif
        /* radix max. digits_seen
         -----------------------------------------------------------
         2     3      2^2-1  (0,1)
         8     255    2^8-1  (0,1,2,3,4,5,6,7)
         10    1023   2^10-1 (0,1,2,3,4,5,6,7,8,9)
         16    65535  2^16-1 (0,1,2,3,4,5,6,7,8,9,a,b,c,d,e,f)
         */
        if ( digits_seen < (1UL << ModuleInfo.radix) )
            base = ModuleInfo.radix;
        break;
    }

#if MASMNUMBER
    /* Masm doesn't swallow alphanum chars which may follow the
     * number!
     */
    if ( base != 0 ) {
#else
    if ( base != 0 && is_valid_id_char( *ptr ) == FALSE ) {
#endif
        buf->token = T_NUM;
        buf->numbase = base;
        buf->itemlen = dig_end - dig_start;
        //DebugMsg(("get_number: inp=%s, value=%" I32_SPEC "X\n", p->input, buf->value64 ));
    } else {
        buf->token = T_BAD_NUM;
        DebugMsg(("get_number: BAD_NUMBER (%s), radix=%u, base=%u, ptr=>%s<, digits_seen=%Xh\n", dig_start, ModuleInfo.radix, base, ptr, digits_seen ));
        /* swallow remainder of token */
        while( is_valid_id_char( *ptr ) ) ++ptr;
    }
number_done:
    /* copy the scanned text, including prefix and suffix */
    len = ptr - p->input;
    memcpy( p->output, p->input, len );
    p->output += len;
    *p->output++ = NULLC;
    p->input = ptr;
    return( NOT_ERROR );
}
#if BACKQUOTES
/* Scan an identifier that is enclosed in backquotes (`name`).
 * The backquotes are stripped; the name is copied NUL-terminated to
 * p->output and the token becomes T_ID.
 * Returns NOT_ERROR, or ERROR (BACKQUOTE_MISSING is emitted) if the end of
 * the line or a ';' is reached before the closing backquote.
 */
static ret_code get_id_in_backquotes( struct asm_tok *buf, struct line_status *p )
/********************************************************************************/
{
    char *optr = p->output;

    buf->token = T_ID;
    buf->idarg = 0;
    p->input++; /* strip off the backquotes */
    while ( *p->input != '`' ) {
        if( *p->input == NULLC || *p->input == ';' ) {
            /* terminate the text collected so far (at the write position,
             * not at the start of the buffer), so the error message shows
             * the unterminated name */
            *optr = NULLC;
            EmitErr( BACKQUOTE_MISSING, p->output );
            return( ERROR );
        }
        *optr++ = *p->input++;
    }
    p->input++; /* skip the terminating '`' */
    *optr++ = NULLC;
    p->output = optr;
    return( NOT_ERROR );
}
#endif
/* get an ID. will always return NOT_ERROR. */
static ret_code get_id( struct asm_tok *buf, struct line_status *p )
/******************************************************************/
{
//struct ReservedWord *resw;
char *src = p->input;
char *dst = p->output;
int index;
unsigned size;
#if CONCATID || DOTNAMEX
continue_scan:
#endif
do {
*dst++ = *src++;
} while ( is_valid_id_char( *src ) );
#if CONCATID
/* v2.05: in case there's a backslash right behind
* the ID, check if a line concatenation is to occur.
* If yes, and the first char of the concatenated line
* is also a valid ID char, continue to scan the name.
* Problem: it's ok for EQU, but less good for other directives.
*/
if ( *src == '\\' ) {
if ( ConcatLine( src, src - p->input, dst, p ) != EMPTY ) {
p->concat = TRUE;
if ( is_valid_id_char( *src ) )
goto continue_scan;
}
}
#endif
#if DOTNAMEX
/* if the name starts with a dot or underscore, then accept dots
* within the name (though not as last char). OPTION DOTNAME
* must be on.
*/
if ( *src == '.' && ModuleInfo.dotname &&
( *(p->output) == '.' || *(p->output) == '_' ) &&
( is_valid_id_char(*(src+1)) || *(src+1) == '.' ) )
goto continue_scan;
#endif
/* v2.04: check added */
size = dst - p->output;
if ( size > MAX_ID_LEN ) {
EmitErr( IDENTIFIER_TOO_LONG );
dst = p->output + MAX_ID_LEN;
}
*dst++ = NULLC;
/* now decide what to do with it */
if( size == 1 && *p->output == '?' ) {
p->input = src;
buf->token = T_QUESTION_MARK;
buf->string_ptr = "?";
return( NOT_ERROR );
}
index = FindResWord( p->output, size );
if( index == 0 ) {
/* if ID begins with a DOT, check for OPTION DOTNAME.
* if not set, skip the token and return a T_DOT instead!
*/
if ( *p->output == '.' && ModuleInfo.dotname == FALSE ) {
buf->token = T_DOT;
buf->string_ptr = (char *)&stokstr1['.' - '('];
p->input++;
return( NOT_ERROR );
}
p->input = src;
p->output = dst;
buf->token = T_ID;
buf->idarg = 0;
return( NOT_ERROR );
}
p->input = src;
p->output = dst;
buf->tokval = index; /* is a enum instr_token value */
/* v2.11: RWF_SPECIAL now obsolete */
//if ( ! ( ResWordTable[index].flags & RWF_SPECIAL ) ) {
if ( index >= SPECIAL_LAST ) {
// DebugMsg(("found item >%s< in instruction table, rm=%X\n", buf->string_ptr, InstrTable[index].rm_byte));
/* if -Zm is set, the following from the Masm docs is relevant:
*
* Reserved Keywords Dependent on CPU Mode with OPTION M510
*
* With OPTION M510, keywords and instructions not available in the
* current CPU mode (such as ENTER under .8086) are not treated as
* keywords. This also means the USE32, FLAT, FAR32, and NEAR32 segment
* types and the 80386/486 registers are not keywords with a processor
* selection less than .386.
* If you remove OPTION M510, any reserved word used as an identifier
* generates a syntax error. You can either rename the identifiers or
* use OPTION NOKEYWORD. For more information on OPTION NOKEYWORD, see
* OPTION NOKEYWORD, later in this appendix.
*
* The current implementation of this rule below is likely to be improved.
*/
if ( ModuleInfo.m510 ) {
/* checking the cpu won't give the expected results currently since
* some instructions in the table (i.e. MOV) start with a 386 variant!
*/
index = IndexFromToken( buf->tokval );
#if 0 /* changed for v1.96 */
if (( InstrTable[index].cpu & P_EXT_MASK ) > ( ModuleInfo.curr_cpu & P_EXT_MASK )) {
#else
if (( InstrTable[index].cpu & P_CPU_MASK ) > ( ModuleInfo.curr_cpu & P_CPU_MASK ) ||
( InstrTable[index].cpu & P_EXT_MASK ) > ( ModuleInfo.curr_cpu & P_EXT_MASK )) {
#endif
buf->token = T_ID;
buf->idarg = 0;
return( NOT_ERROR );
}
}
buf->token = T_INSTRUCTION;
return( NOT_ERROR );
}
index = buf->tokval;
/* for RWT_SPECIAL, field <bytval> contains further infos:
- RWT_REG: register number (regnum)
- RWT_DIRECTIVE: type of directive (dirtype)
- RWT_UNARY_OPERATOR: operator precedence
- RWT_BINARY_OPERATOR: operator precedence
- RWT_STYPE: memtype
- RWT_RES_ID: for languages, LANG_xxx value
for the rest, unused.
*/
buf->bytval = SpecialTable[index].bytval;
switch ( SpecialTable[index].type ) {
case RWT_REG:
buf->token = T_REG;
break;
case RWT_DIRECTIVE:
buf->token = T_DIRECTIVE;
if ( p->flags2 == 0 )
p->flags2 = SpecialTable[index].value;
break;
case RWT_UNARY_OP: /* OFFSET, LOW, HIGH, LOWWORD, HIGHWORD, SHORT, ... */
buf->token = T_UNARY_OPERATOR;
break;
case RWT_BINARY_OP: /* GE, GT, LE, LT, EQ, NE, MOD, PTR */
buf->token = T_BINARY_OPERATOR;
break;
case RWT_STYPE: /* BYTE, WORD, FAR, NEAR, FAR16, NEAR32 ... */
buf->token = T_STYPE;
break;
case RWT_RES_ID: /* DUP, ADDR, FLAT, VARARG, language types [, FRAME (64-bit)] */
buf->token = T_RES_ID;
break;
default: /* shouldn't happen */
DebugMsg(("get_id: error, unknown type in SpecialTable[%u]=%u\n", index, SpecialTable[index].type ));
/**/myassert( 0 );
buf->token = T_ID;
buf->idarg = 0;
break;
}
return( NOT_ERROR );
}
/* get one token.
 * possible return values: NOT_ERROR, ERROR, EMPTY.
 *
 * The first character at p->input selects the scanner: get_number() for
 * a digit, get_id() for a valid identifier start, get_id_in_backquotes()
 * for a backquote (if enabled), else get_special_symbol().
 *
 * names beginning with '.' are difficult to detect,
 * because the dot is a binary operator. The rules to
 * accept a "dotted" name are:
 * 1.- a valid ID char is to follow the dot
 * 2.- if buffer index is > 0, then the previous item
 *     must not be a reg, ), ] or an ID.
 *     [bx.abc]    -> . is an operator
 *     ([bx]).abc  -> . is an operator
 *     [bx].abc    -> . is an operator
 *     varname.abc -> . is an operator
 */
/* the argument is converted to unsigned char for isalpha(): the <ctype.h>
 * functions aren't defined for negative values of a plain char.
 */
#define is_valid_id_start( ch ) ( isalpha( (unsigned char)(ch) ) || (ch)=='_' || (ch)=='@' || (ch)=='$' || (ch)=='?' )
ret_code GetToken( struct asm_tok token[], struct line_status *p )
/****************************************************************/
{
    if( isdigit( (unsigned char)*p->input ) ) {
        return( get_number( token, p ) );
    } else if( is_valid_id_start( *p->input ) ) {
        return( get_id( token, p ) );
    } else if( *p->input == '.' &&
#if DOTNAMEX /* allow dots within identifiers */
              ( is_valid_id_char(*(p->input+1)) || *(p->input+1) == '.' ) &&
#else
              is_valid_id_char(*(p->input+1)) &&
#endif
              /* v2.11: member last_token has been removed; the previous
               * token is examined directly instead */
              ( p->index == 0 || ( token[-1].token != T_REG && token[-1].token != T_CL_BRACKET && token[-1].token != T_CL_SQ_BRACKET && token[-1].token != T_ID ) ) ) {
        return( get_id( token, p ) );
#if BACKQUOTES
    } else if( *p->input == '`' && Options.strict_masm_compat == FALSE ) {
        return( get_id_in_backquotes( token, p ) );
#endif
    }
    return( get_special_symbol( token, p ) );
}
// fixme char *IfSymbol; /* save symbols in IFDEF's so they don't get expanded */
/* Start a COMMENT block.
 * p: the text that follows the COMMENT directive.
 * The first non-space character is the comment delimiter; it is stored in
 * ModuleInfo.inside_comment. If the delimiter occurs again in the rest of
 * the text, the comment ends on the same line and inside_comment is reset.
 * If there's no delimiter, COMMENT_DELIMITER_EXPECTED is emitted.
 */
static void StartComment( const char *p )
/***************************************/
{
    /* plain char may be negative; isspace() needs an unsigned char value */
    while ( isspace( (unsigned char)*p ) ) p++;
    if ( *p == NULLC ) {
        EmitError( COMMENT_DELIMITER_EXPECTED );
        return;
    }
    ModuleInfo.inside_comment = *p++;
    if( strchr( p, ModuleInfo.inside_comment ) )
        ModuleInfo.inside_comment = NULLC;
}
int Tokenize( char *line, unsigned int start, struct asm_tok tokenarray[], unsigned int flags )
/*********************************************************************************************/
/*
 * create tokens from a source line.
 * line:  the line which is to be tokenized
 * start: where to start in the token buffer. If start == 0,
 *        then some variables are additionally initialized.
 * flags: 1=if the line has been tokenized already.
 *        The value is compared with TOK_DEFAULT to decide whether ';'
 *        starts a comment; the conditional assembly check is skipped if
 *        TOK_RESCAN is set.
 * returns the index of the terminating T_FINAL token; the field bytval of
 * this token receives the TF3_xxx flags of the line. If an error occurs,
 * the line is skipped and <start> is returned.
 */
{
    int rc;
    struct line_status p;

    p.input = line;
    p.start = line;
    p.index = start;
    //p.last_token = T_FINAL; /* v2.11: last_token is obsolete */
    p.flags = flags;
    p.flags2 = 0;
    p.flags3 = 0;
    if ( p.index == 0 ) {
#ifdef DEBUG_OUT
        cnttok0++;
#endif
        /* v2.06: these flags are now initialized on a higher level */
        //ModuleInfo.line_flags = 0;
        p.output = token_stringbuf;
        if( ModuleInfo.inside_comment ) {
            /* inside a COMMENT block: the line is skipped, it's just
             * checked whether it contains the closing delimiter */
            DebugMsg1(("COMMENT active, delim is >%c<, line is >%s<\n", ModuleInfo.inside_comment, line));
            if( strchr( line, ModuleInfo.inside_comment ) != NULL ) {
                DebugMsg1(("COMMENT mode exited\n"));
                ModuleInfo.inside_comment = NULLC;
            }
            goto skipline;
        }
        /* v2.08: expansion operator % at pos 0 is handled differently.
         */
    } else {
#ifdef DEBUG_OUT
        cnttok1++;
#endif
        p.output = StringBufferEnd;
    }

    for( ;; ) {

        /* plain char may be negative; isspace() needs an unsigned char value */
        while( isspace( (unsigned char)*p.input ) ) p.input++;

        if ( *p.input == ';' && flags == TOK_DEFAULT ) {
            /* save the comment (including preceding white space) and cut it off */
            while ( p.input > line && isspace( (unsigned char)*(p.input-1) ) ) p.input--; /* skip */
            strcpy( commentbuffer, p.input );
            ModuleInfo.CurrComment = commentbuffer;
            *p.input = NULLC;
        }

        tokenarray[p.index].tokpos = p.input;
        if( *p.input == NULLC ) {
            /* if a comma is last token, concat lines ... with some exceptions
             * v2.05: moved from PreprocessLine(). Moved because the
             * concatenation may be triggered by a comma AFTER expansion.
             */
            if ( p.index > 1 &&
                tokenarray[p.index-1].token == T_COMMA
#if FASTPASS
                && ( Parse_Pass == PASS_1 || UseSavedState == FALSE ) /* is it an already preprocessed line? */
#endif
                && start == 0 ) {
                DebugMsg1(("Tokenize: calling IsMultiLine()\n" ));
                if ( IsMultiLine( tokenarray ) ) {
                    /* NOTE(review): nothing has been stored at p.output at
                     * this point, so strlen() looks at old buffer contents;
                     * it only influences where the next line is read to. */
                    char *ptr = GetAlignedPointer( p.output, strlen( p.output ) );
                    DebugMsg1(("Tokenize: IsMultiLine(%s)=TRUE\n", line ));
                    if ( GetTextLine( ptr ) ) {
                        while ( isspace( (unsigned char)*ptr ) ) ptr++;
                        if ( *ptr ) {
                            strcpy( p.input, ptr );
                            if ( strlen( p.start ) >= MAX_LINE_LEN ) {
                                EmitError( LINE_TOO_LONG );
                                p.index = start;
                                break;
                            }
                            DebugMsg1(("Tokenize: line concatenation, line=%s\n", line ));
                            continue;
                        }
                    }
                }
            }
            break;
        }

        tokenarray[p.index].string_ptr = p.output;
        rc = GetToken( &tokenarray[p.index], &p );
        if ( rc == EMPTY )
            continue;
        if ( rc == ERROR ) {
            p.index = start; /* skip this line */
            break;
        }

        /* v2.04: this has been moved here from condasm.c to
         * avoid problems with (conditional) listings. It also
         * avoids having to search for the first token twice.
         * Note: a conditional assembly directive within an
         * inactive block and preceded by a label isn't detected!
         * This is an exact copy of the Masm behavior, although
         * it probably is just a bug!
         */
        if ( !(flags & TOK_RESCAN) ) {
            if ( p.index == 0 || ( p.index == 2 && ( tokenarray[1].token == T_COLON || tokenarray[1].token == T_DBL_COLON) ) ) {
                if ( tokenarray[p.index].token == T_DIRECTIVE &&
                    tokenarray[p.index].bytval == DRT_CONDDIR ) {
                    if ( tokenarray[p.index].tokval == T_COMMENT ) {
                        StartComment( p.input );
                        /* the delimiter is known once StartComment() has run
                         * (it's NUL if the comment ended on the same line) */
                        DebugMsg1(("tokenize: COMMENT starting, delim is >%c<\n", ModuleInfo.inside_comment));
                        break; /* p.index is 0 or 2 */
                    }
                    conditional_assembly_prepare( tokenarray[p.index].tokval );
                    if ( CurrIfState != BLOCK_ACTIVE ) {
                        p.index++;
                        break; /* p.index is 1 or 3 */
                    }
                } else if( CurrIfState != BLOCK_ACTIVE ) {
                    /* further processing skipped. p.index is 0 */
                    break;
                }
            }
        }
        //p.last_token = tokenarray[p.index].token; /* v2.11: last_token is obsolete */
        p.index++;
        if( p.index >= MAX_TOKEN ) {
            DebugMsg1(("tokenize: token index %u >= MAX_TOKEN (=%u), line=>%s<\n", p.index, MAX_TOKEN, line ));
            EmitError( TOO_MANY_TOKENS );
            p.index = start;
            goto skipline;
        }

#if TOKSTRALIGN
        p.output = GetAlignedPointer( token_stringbuf, p.output - token_stringbuf );
#endif

    }

#if TOKSTRALIGN
    p.output = GetAlignedPointer( token_stringbuf, p.output - token_stringbuf );
#endif
    StringBufferEnd = p.output;

skipline:
    /* terminate the token array */
    tokenarray[p.index].token = T_FINAL;
    tokenarray[p.index].bytval = p.flags3;
    tokenarray[p.index].string_ptr = "";
    return( p.index );
}