mirror of
https://github.com/NishiOwO/JWasm.git
synced 2025-04-22 01:04:39 +00:00
1088 lines
38 KiB
C
1088 lines
38 KiB
C
/****************************************************************************
|
|
*
|
|
* Open Watcom Project
|
|
*
|
|
* Portions Copyright (c) 1983-2002 Sybase, Inc. All Rights Reserved.
|
|
*
|
|
* ========================================================================
|
|
*
|
|
* This file contains Original Code and/or Modifications of Original
|
|
* Code as defined in and that are subject to the Sybase Open Watcom
|
|
* Public License version 1.0 (the 'License'). You may not use this file
|
|
* except in compliance with the License. BY USING THIS FILE YOU AGREE TO
|
|
* ALL TERMS AND CONDITIONS OF THE LICENSE. A copy of the License is
|
|
* provided with the Original Code and Modifications, and is also
|
|
* available at www.sybase.com/developer/opensource.
|
|
*
|
|
* The Original Code and all software distributed under the License are
|
|
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
|
|
* EXPRESS OR IMPLIED, AND SYBASE AND ALL CONTRIBUTORS HEREBY DISCLAIM
|
|
* ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR
|
|
* NON-INFRINGEMENT. Please see the License for the specific language
|
|
* governing rights and limitations under the License.
|
|
*
|
|
* ========================================================================
|
|
*
|
|
* Description: tokenizer.
|
|
*
|
|
****************************************************************************/
|
|
|
|
#include <ctype.h>
|
|
|
|
#include "globals.h"
|
|
#include "memalloc.h"
|
|
#include "parser.h"
|
|
#include "condasm.h"
|
|
#include "reswords.h"
|
|
#include "input.h"
|
|
#include "segment.h"
|
|
#include "listing.h"
|
|
#include "tokenize.h"
|
|
#include "fastpass.h"
|
|
#include "myassert.h"
|
|
|
|
/* Compile-time switches of the tokenizer:
 * CONCATID    - if non-zero, get_id() checks for a backslash line
 *               continuation directly behind an identifier.
 * MASMNUMBER  - if non-zero, get_number() accepts a number even when
 *               identifier characters follow it directly.
 * TOKSTRALIGN - if non-zero, Tokenize() re-aligns the token string
 *               buffer pointer after each token.
 * DOTNAMEX    - if non-zero, dots are accepted inside identifiers
 *               (see get_id() and GetToken()); may be preset externally.
 */
#define CONCATID 0 /* 0=most compatible (see backsl.asm) */
|
|
#define MASMNUMBER 1 /* 1=Masm-compatible number scanning */
|
|
#ifdef __I86__
|
|
#define TOKSTRALIGN 0 /* 0=don't align token strings */
|
|
#else
|
|
#define TOKSTRALIGN 1 /* 1=align token strings to sizeof(uint_32) */
|
|
#endif
|
|
|
|
#ifndef DOTNAMEX /* v2.08: added */
|
|
/* set DOTNAMEX to 1 if support for Intel C++ generated assembly code
|
|
* is to be enabled.
|
|
*/
|
|
#define DOTNAMEX 0
|
|
#endif
|
|
|
|
/* Data shared with other modules.
 * - ResWordTable is only referenced from commented-out code in this file.
 * - cnttok0/cnttok1 (debug builds only) count Tokenize() calls with
 *   start == 0 and start != 0 respectively.
 * - token_stringbuf is where Tokenize() starts to store token strings
 *   when it is called with start == 0.
 * - commentbuffer receives the text of a trailing ';' comment.
 */
extern struct ReservedWord ResWordTable[];
|
|
|
|
#ifdef DEBUG_OUT
|
|
int_32 cnttok0;
|
|
int_32 cnttok1;
|
|
extern struct asm_tok *end_tokenarray;
|
|
extern char *end_stringbuf;
|
|
#endif
|
|
extern char *token_stringbuf; /* start token string buffer */
|
|
extern char *commentbuffer;
|
|
|
|
/* v2.08: moved to struct line_status */
|
|
//static uint_8 g_flags; /* directive flags for current line */
|
|
|
|
#if !defined(__GNUC__) && !defined(__POCC__)
/* ASCII-only replacement for tolower().
 * The argument is fully parenthesized so that any expression may be
 * passed; note that it is still evaluated more than once, so it must
 * not have side effects.
 */
#define tolower(c) ( ( (c) >= 'A' && (c) <= 'Z' ) ? ( (c) | 0x20 ) : (c) )
#endif
|
|
|
|
/* Constant one-character token strings. Each entry is a complete,
 * NUL-terminated C string of 2 bytes, so "(char *)&stokstrN[i]" is a
 * valid string on any host, independent of byte order and of
 * sizeof(short).
 */
/* strings for token 0x28 - 0x2F */
static const char stokstr1[][2] = {
    "(", ")", "*", "+", ",", "-", ".", "/" };
/* strings for token 0x5B - 0x5D (0x5C is unused, hence the empty slot) */
static const char stokstr2[][2] = {
    "[", "", "]" };
|
|
|
|
/* test line concatenation if last token is a comma.
|
|
* dont concat EQU, macro invocations or
|
|
* - ECHO
|
|
* - FORC/IRPC (v2.0)
|
|
* - INCLUDE (v2.8)
|
|
* lines!
|
|
* v2.05: don't concat if line's an instruction.
|
|
*/
|
|
static bool IsMultiLine( struct asm_tok tokenarray[] )
|
|
/****************************************************/
|
|
{
|
|
struct asym *sym;
|
|
int i;
|
|
|
|
if ( tokenarray[1].token == T_DIRECTIVE && tokenarray[1].tokval == T_EQU )
|
|
return( FALSE );
|
|
i = ( tokenarray[1].token == T_COLON ? 2 : 0 );
|
|
/* don't concat macros */
|
|
if ( tokenarray[i].token == T_ID ) {
|
|
sym = SymSearch( tokenarray[i].string_ptr );
|
|
if ( sym && ( sym->state == SYM_MACRO )
|
|
#if VARARGML
|
|
&& sym->mac_multiline == FALSE /* v2.11: added */
|
|
#endif
|
|
)
|
|
return( FALSE );
|
|
} else if ( tokenarray[i].token == T_INSTRUCTION ||
|
|
( tokenarray[i].token == T_DIRECTIVE &&
|
|
( tokenarray[i].tokval == T_ECHO ||
|
|
tokenarray[i].tokval == T_INCLUDE ||
|
|
tokenarray[i].tokval == T_FORC ||
|
|
tokenarray[i].tokval == T_IRPC ) ) ) {
|
|
return( FALSE );
|
|
}
|
|
return( TRUE );
|
|
}
|
|
|
|
static ret_code get_float( struct asm_tok *buf, struct line_status *p )
|
|
/*********************************************************************/
|
|
{
|
|
/* valid floats look like: (int)[.(int)][e(int)]
|
|
* Masm also allows hex format, terminated by 'r' (3F800000r)
|
|
*/
|
|
|
|
char got_decimal = FALSE;
|
|
char got_e = FALSE;
|
|
char *ptr = p->input;
|
|
|
|
for( ; *ptr != NULLC; ptr++ ) {
|
|
char c = *ptr;
|
|
if( isdigit( c ) ) {
|
|
;
|
|
} else if ( c == '.' && got_decimal == FALSE ) {
|
|
got_decimal = TRUE;
|
|
} else if ( tolower( c ) == 'e' && got_e == FALSE ) {
|
|
got_e = TRUE;
|
|
/* accept e+2 / e-4 /etc. */
|
|
if ( *(ptr+1) == '+' || *(ptr+1) == '-' )
|
|
ptr++;
|
|
/* it's accepted if there's no digit behind 'e' */
|
|
//if ( !isdigit( *(ptr+1) ) )
|
|
// break;
|
|
} else
|
|
break;
|
|
}
|
|
|
|
buf->token = T_FLOAT;
|
|
buf->floattype = NULLC;
|
|
memcpy( p->output, p->input, ptr - p->input );
|
|
p->output += ( ptr - p->input );
|
|
*p->output++ = NULLC;
|
|
p->input = ptr;
|
|
|
|
/* the binary value isn't used currently */
|
|
//*((float *)(&buf->value)) = atof( buf->string_ptr );
|
|
|
|
return( NOT_ERROR );
|
|
}
|
|
|
|
static ret_code ConcatLine( char *src, int cnt, char *out, struct line_status *ls )
|
|
/*********************************************************************************/
|
|
{
|
|
char *p = src+1;
|
|
int max;
|
|
|
|
while ( isspace(*p) ) p++;
|
|
if ( *p == NULLC || *p == ';' ) {
|
|
//char *buffer = GetAlignedPointer( out, strlen( out ) );
|
|
char *buffer = out;
|
|
if( GetTextLine( buffer ) ) {
|
|
p = buffer;
|
|
/* skip leading spaces */
|
|
while ( isspace( *p ) ) p++;
|
|
max = strlen( p );
|
|
if ( cnt == 0 )
|
|
*src++ = ' ';
|
|
if ( ( src - ls->start ) + max >= MAX_LINE_LEN ) {
|
|
EmitError( LINE_TOO_LONG );
|
|
max = MAX_LINE_LEN - ( src - ls->start + 1 );
|
|
*(p+max) = NULLC;
|
|
}
|
|
memcpy( src, p, max+1 );
|
|
return( NOT_ERROR );
|
|
}
|
|
}
|
|
return( EMPTY );
|
|
}
|
|
|
|
static ret_code get_string( struct asm_tok *buf, struct line_status *p )
|
|
/**********************************************************************/
|
|
{
|
|
char symbol_o;
|
|
char symbol_c;
|
|
char c;
|
|
char *src = p->input;
|
|
char *dst = p->output;
|
|
int count = 0;
|
|
int level;
|
|
|
|
symbol_o = *src;
|
|
|
|
switch( symbol_o ) {
|
|
case '"':
|
|
case '\'':
|
|
buf->string_delim = symbol_o;
|
|
*dst++ = symbol_o;
|
|
src++;
|
|
for ( ; count < MAX_STRING_LEN; src++, count++ ) {
|
|
c = *src;
|
|
if( c == symbol_o ) { /* another quote? */
|
|
*dst++ = c; /* store it */
|
|
src++;
|
|
if( *src != c )
|
|
break; /* exit loop */
|
|
/* a pair of quotes inside the string is
|
|
* handled as a single quote */
|
|
} else if( c == NULLC ) {
|
|
/* missing terminating quote, change to undelimited string */
|
|
buf->string_delim = NULLC;
|
|
count++; /* count the first quote */
|
|
break;
|
|
} else {
|
|
*dst++ = c;
|
|
}
|
|
}
|
|
break; /* end of string marker is the same */
|
|
case '{':
|
|
if ( p->flags & TOK_NOCURLBRACES )
|
|
goto undelimited_string;
|
|
case '<':
|
|
buf->string_delim = symbol_o;
|
|
symbol_c = ( symbol_o == '<' ? '>' : '}' );
|
|
src++;
|
|
for( level = 0; count < MAX_STRING_LEN; ) {
|
|
c = *src;
|
|
if( c == symbol_o ) { /* < or { ? */
|
|
level++;
|
|
*dst++ = c; src++;
|
|
count++;
|
|
} else if( c == symbol_c ) { /* > or }? */
|
|
if( level ) {
|
|
level--;
|
|
*dst++ = c; src++;
|
|
count++;
|
|
} else {
|
|
/* store the string delimiter unless it is <> */
|
|
/* v2.08: don't store delimiters for {}-literals */
|
|
//if (symbol_o != '<')
|
|
// *dst++ = c;
|
|
src++;
|
|
break; /* exit loop */
|
|
}
|
|
#if 1
|
|
/*
|
|
a " or ' inside a <>/{} string? Since it's not a must that
|
|
[double-]quotes are paired in a literal it must be done
|
|
directive-dependant!
|
|
see: IFIDN <">,<">
|
|
*/
|
|
} else if( ( c == '"' || c == '\'' ) && ( p->flags2 & DF_STRPARM ) == 0 ) {
|
|
char delim = c;
|
|
char *tdst;
|
|
char *tsrc;
|
|
int tcount;
|
|
*dst++ = c; src++;
|
|
count++;
|
|
tdst = dst;
|
|
tsrc = src;
|
|
tcount = count;
|
|
while (*src != delim && *src != NULLC && count < MAX_STRING_LEN-1 ) {
|
|
if ( symbol_o == '<' && *src == '!' && *(src+1) != NULLC )
|
|
src++;
|
|
*dst++ = *src++;
|
|
count++;
|
|
}
|
|
if ( *src == delim ) {
|
|
*dst++ = *src++;
|
|
count++;
|
|
continue;
|
|
} else {
|
|
/* restore values */
|
|
src = tsrc;
|
|
dst = tdst;
|
|
count = tcount;
|
|
}
|
|
#endif
|
|
} else if( c == '!' && symbol_o == '<' && *(src+1) ) {
|
|
/* handle literal-character operator '!'.
|
|
* it makes the next char to enter the literal uninterpreted.
|
|
*/
|
|
/* v2.09: don't store the '!' */
|
|
//*dst++ = c; src++;
|
|
//count++;
|
|
//if ( count == MAX_STRING_LEN )
|
|
// break;
|
|
src++;
|
|
*dst++ = *src++;
|
|
count++;
|
|
} else if( c == '\\' && ConcatLine( src, count, dst, p ) != EMPTY ) {
|
|
p->flags3 |= TF3_ISCONCAT;
|
|
} else if( c == NULLC || ( c == ';' && symbol_o == '{' )) {
|
|
if ( p->flags == TOK_DEFAULT && (( p->flags2 & DF_NOCONCAT ) == 0 ) ) { /* <{ */
|
|
/* if last nonspace character was a comma
|
|
* get next line and continue string scan
|
|
*/
|
|
char *tmp = dst-1;
|
|
while ( isspace(*tmp) ) tmp--;
|
|
if ( *tmp == ',' ) {
|
|
DebugMsg1(("Tokenize.get_string: comma concatenation: %s\n", src ));
|
|
tmp = GetAlignedPointer( p->output, strlen( p->output ) );
|
|
if( GetTextLine( tmp ) ) {
|
|
/* skip leading spaces */
|
|
while ( isspace( *tmp ) ) tmp++;
|
|
/* this size check isn't fool-proved yet */
|
|
if ( strlen( tmp ) + count >= MAX_LINE_LEN ) {
|
|
EmitError( LINE_TOO_LONG );
|
|
return( ERROR );
|
|
}
|
|
strcpy( src, tmp );
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
src = p->input;
|
|
dst = p->output;
|
|
*dst++ = *src++;
|
|
count = 1;
|
|
goto undelimited_string;
|
|
} else {
|
|
*dst++ = c; src++;
|
|
count++;
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
undelimited_string:
|
|
buf->string_delim = NULLC;
|
|
/* this is an undelimited string,
|
|
* so just copy it until we hit something that looks like the end.
|
|
* this format is used by the INCLUDE directive, but may also
|
|
* occur inside the string macros!
|
|
*/
|
|
/* v2.05: also stop if a ')' is found - see literal2.asm regression test */
|
|
//for( count = 0 ; count < MAX_STRING_LEN && *src != NULLC && !isspace( *src ) && *src != ',' && *src != ';'; ) {
|
|
for( ; count < MAX_STRING_LEN &&
|
|
/* v2.08: stop also at < and % */
|
|
//*src != NULLC && !isspace( *src ) && *src != ',' && *src != ';' && *src != ')'; ) {
|
|
//*src && !isspace( *src ) && *src != ',' && *src != ')' && *src != '<' && *src != '%'; ) {
|
|
*src && !isspace( *src ) && *src != ',' && *src != ')' && *src != '%'; ) {
|
|
if ( *src == ';' && p->flags == TOK_DEFAULT )
|
|
break;
|
|
/* v2.11: handle '\' also for expanded lines */
|
|
//if ( *src == '\\' && !( p->flags & TOK_NOCURLBRACES ) ) {
|
|
if ( *src == '\\' && ( p->flags == TOK_DEFAULT || ( p->flags & TOK_LINE ) ) ) {
|
|
if ( ConcatLine( src, count, dst, p ) != EMPTY ) {
|
|
DebugMsg1(("Tokenize.get_string: backslash concatenation: >%s<\n", src ));
|
|
p->flags3 |= TF3_ISCONCAT;
|
|
if ( count )
|
|
continue;
|
|
return( EMPTY );
|
|
}
|
|
}
|
|
/* v2.08: handle '!' operator */
|
|
if ( *src == '!' && *(src+1) && count < MAX_STRING_LEN - 1 )
|
|
*dst++ = *src++;
|
|
*dst++ = *src++;
|
|
count++;
|
|
}
|
|
break;
|
|
}
|
|
|
|
if ( count == MAX_STRING_LEN ) {
|
|
EmitError( STRING_OR_TEXT_LITERAL_TOO_LONG );
|
|
return( ERROR );
|
|
}
|
|
*dst++ = NULLC;
|
|
buf->token = T_STRING;
|
|
buf->stringlen = count;
|
|
p->input = src;
|
|
p->output = dst;
|
|
return( NOT_ERROR );
|
|
}
|
|
|
|
static ret_code get_special_symbol( struct asm_tok *buf, struct line_status *p )
|
|
/******************************************************************************/
|
|
{
|
|
char symbol;
|
|
//int i;
|
|
|
|
symbol = *p->input;
|
|
switch( symbol ) {
|
|
case ':' : /* T_COLON binary operator (0x3A) */
|
|
p->input++;
|
|
if ( *p->input == ':' ) {
|
|
p->input++;
|
|
buf->token = T_DBL_COLON;
|
|
buf->string_ptr = "::";
|
|
} else {
|
|
buf->token = T_COLON;
|
|
buf->string_ptr = ":";
|
|
}
|
|
break;
|
|
case '%' : /* T_PERCENT (0x25) */
|
|
#if PERCENT_OUT
|
|
/* %OUT directive? */
|
|
if ( ( _memicmp( p->input+1, "OUT", 3 ) == 0 ) && !is_valid_id_char( *(p->input+4) ) ) {
|
|
buf->token = T_DIRECTIVE;
|
|
buf->tokval = T_ECHO;
|
|
buf->dirtype = DRT_ECHO;
|
|
memcpy( p->output, p->input, 4 );
|
|
p->input += 4;
|
|
p->output += 4;
|
|
*(p->output)++ = NULLC;
|
|
break;
|
|
}
|
|
#endif
|
|
p->input++;
|
|
if ( p->flags == TOK_DEFAULT && p->index == 0 ) {
|
|
p->flags3 |= TF3_EXPANSION;
|
|
return( EMPTY );
|
|
}
|
|
buf->token = T_PERCENT;
|
|
buf->string_ptr = "%";
|
|
break;
|
|
case '(' : /* 0x28: T_OP_BRACKET operator - needs a matching ')' */
|
|
/* v2.11: reset c-expression flag if a macro function call is detected */
|
|
if ( ( p->flags2 & DF_CEXPR ) && p->index && (buf-1)->token == T_ID ) {
|
|
struct asym *sym = SymSearch( (buf-1)->string_ptr );
|
|
if ( sym && ( sym->state == SYM_MACRO ) && sym->isfunc )
|
|
p->flags2 &= ~DF_CEXPR;
|
|
}
|
|
/* no break */
|
|
case ')' : /* 0x29: T_CL_BRACKET */
|
|
case '*' : /* 0x2A: binary operator */
|
|
case '+' : /* 0x2B: unary|binary operator */
|
|
case ',' : /* 0x2C: T_COMMA */
|
|
case '-' : /* 0x2D: unary|binary operator */
|
|
case '.' : /* 0x2E: T_DOT binary operator */
|
|
case '/' : /* 0x2F: binary operator */
|
|
/* all of these are themselves a token */
|
|
p->input++;
|
|
buf->token = symbol;
|
|
buf->specval = 0; /* initialize, in case the token needs extra data */
|
|
/* v2.06: use constants for the token string */
|
|
buf->string_ptr = (char *)&stokstr1[symbol - '('];
|
|
break;
|
|
case '[' : /* T_OP_SQ_BRACKET operator - needs a matching ']' (0x5B) */
|
|
case ']' : /* T_CL_SQ_BRACKET (0x5D) */
|
|
p->input++;
|
|
buf->token = symbol;
|
|
/* v2.06: use constants for the token string */
|
|
buf->string_ptr = (char *)&stokstr2[symbol - '['];
|
|
break;
|
|
case '=' : /* (0x3D) */
|
|
if ( *(p->input+1) != '=' ) {
|
|
buf->token = T_DIRECTIVE;
|
|
buf->tokval = T_EQU;
|
|
buf->dirtype = DRT_EQUALSGN; /* to make it differ from EQU directive */
|
|
buf->string_ptr = "=";
|
|
p->input++;
|
|
break;
|
|
}
|
|
/* fall through */
|
|
default:
|
|
/* detect C style operators.
|
|
* DF_CEXPR is set if .IF, .WHILE, .ELSEIF or .UNTIL
|
|
* has been detected in the current line.
|
|
* will catch: '!', '<', '>', '&', '==', '!=', '<=', '>=', '&&', '||'
|
|
* A single '|' will also be caught, although it isn't a valid
|
|
* operator - it will cause a 'operator expected' error msg later.
|
|
* the tokens are stored as one- or two-byte sized "strings".
|
|
*/
|
|
if ( ( p->flags2 & DF_CEXPR ) && strchr( "=!<>&|", symbol ) ) {
|
|
*(p->output)++ = symbol;
|
|
p->input++;
|
|
buf->stringlen = 1;
|
|
if ( symbol == '&' || symbol == '|' ) {
|
|
if ( *p->input == symbol ) {
|
|
*(p->output)++ = symbol;
|
|
p->input++;
|
|
buf->stringlen = 2;
|
|
}
|
|
} else if ( *p->input == '=' ) {
|
|
*(p->output)++ = '=';
|
|
p->input++;
|
|
buf->stringlen = 2;
|
|
}
|
|
buf->token = T_STRING;
|
|
buf->string_delim = NULLC;
|
|
*(p->output)++ = NULLC;
|
|
break;
|
|
}
|
|
/* v2.08: ampersand is a special token */
|
|
if ( symbol == '&' ) {
|
|
p->input++;
|
|
buf->token = '&';
|
|
buf->string_ptr = "&";
|
|
break;
|
|
}
|
|
/* anything we don't recognise we will consider a string,
|
|
* delimited by space characters, commas, newlines or nulls
|
|
*/
|
|
return( get_string( buf, p ) );
|
|
}
|
|
return( NOT_ERROR );
|
|
}
|
|
|
|
#if 0
|
|
/* Multiply the number stored in buf[0..size-1] (least significant byte
 * first) by <base> and add <num>; the carry is propagated from byte to
 * byte, a carry out of the last byte is dropped.
 */
static void array_mul_add( unsigned char *buf, unsigned base, unsigned num, unsigned size )
/*****************************************************************************************/
{
    unsigned char *end = buf + size;

    for( ; buf != end; buf++ ) {
        num += *buf * base;
        *buf = num;     /* keeps the low 8 bits */
        num >>= 8;      /* the rest is the carry for the next byte */
    }
}
|
|
#endif
|
|
|
|
/* read in a number.
|
|
* check the number suffix:
|
|
* b or y: base 2
|
|
* d or t: base 10
|
|
* h: base 16
|
|
* o or q: base 8
|
|
*/
|
|
static ret_code get_number( struct asm_tok *buf, struct line_status *p )
|
|
/**********************************************************************/
|
|
{
|
|
char *ptr = p->input;
|
|
char *dig_start;
|
|
char *dig_end;
|
|
unsigned base = 0;
|
|
unsigned len;
|
|
uint_32 digits_seen;
|
|
char last_char;
|
|
|
|
#define VALID_BINARY 0x0003
|
|
#define VALID_OCTAL 0x00ff
|
|
#define VALID_DECIMAL 0x03ff
|
|
#define OK_NUM( t ) ((digits_seen & ~VALID_##t) == 0)
|
|
|
|
digits_seen = 0;
|
|
#if CHEXPREFIX
|
|
if( *ptr == '0' && (tolower( *(ptr+1) ) == 'x' ) ) {
|
|
ptr += 2;
|
|
base = 16;
|
|
}
|
|
#endif
|
|
dig_start = ptr;
|
|
for( ;; ptr++ ) {
|
|
if (*ptr >= '0' && *ptr <= '9')
|
|
digits_seen |= 1 << (*ptr - '0');
|
|
else {
|
|
last_char = tolower( *ptr );
|
|
if ( last_char >= 'a' && last_char <= 'f' )
|
|
digits_seen |= 1 << ( last_char + 10 - 'a' );
|
|
else
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* note that a float MUST contain a dot.
|
|
* 1234e78 is NOT a valid float
|
|
*/
|
|
if ( last_char == '.' )
|
|
return( get_float( buf, p ) );
|
|
|
|
#if 0
|
|
/* v2.08: if suffix isn't followed by a non-id char, don't use it! */
|
|
if ( last_char && is_valid_id_char( *(ptr+1) ) ) {
|
|
last_char = NULLC;
|
|
while ( *(ptr-1) > '9' )
|
|
ptr--;
|
|
digits_seen &= 0x3FF;
|
|
|
|
}
|
|
#endif
|
|
|
|
#if CHEXPREFIX
|
|
if ( base != 0 ) {
|
|
dig_end = ptr;
|
|
if ( digits_seen == 0 )
|
|
base = 0;
|
|
} else
|
|
#endif
|
|
switch( last_char ) {
|
|
case 'r': /* a float with the "real number designator" */
|
|
buf->token = T_FLOAT;
|
|
buf->floattype = 'r';
|
|
ptr++;
|
|
goto number_done;
|
|
case 'h':
|
|
base = 16;
|
|
dig_end = ptr;
|
|
ptr++;
|
|
break;
|
|
//case 'b':
|
|
case 'y':
|
|
if( OK_NUM( BINARY ) ) {
|
|
base = 2;
|
|
dig_end = ptr;
|
|
ptr++;
|
|
}
|
|
break;
|
|
//case 'd':
|
|
case 't':
|
|
if( OK_NUM( DECIMAL ) ) {
|
|
base = 10;
|
|
dig_end = ptr;
|
|
ptr++;
|
|
}
|
|
break;
|
|
case 'q':
|
|
case 'o':
|
|
if( OK_NUM( OCTAL ) ) {
|
|
base = 8;
|
|
dig_end = ptr;
|
|
ptr++;
|
|
}
|
|
break;
|
|
default:
|
|
last_char = tolower( *(ptr-1) );
|
|
if ( ( last_char == 'b' || last_char == 'd' ) && digits_seen >= ( 1UL << ModuleInfo.radix ) ) {
|
|
char *tmp = dig_start;
|
|
char max = ( last_char == 'b' ? '1' : '9' );
|
|
for ( dig_end = ptr-1; tmp < dig_end && *tmp <= max; tmp++ );
|
|
if ( tmp == dig_end ) {
|
|
base = ( last_char == 'b' ? 2 : 10 );
|
|
break;
|
|
}
|
|
}
|
|
dig_end = ptr;
|
|
#if COCTALS
|
|
if( Options.allow_c_octals && *dig_start == '0' ) {
|
|
if( OK_NUM( OCTAL ) ) {
|
|
base = 8;
|
|
break;
|
|
}
|
|
}
|
|
#endif
|
|
/* radix max. digits_seen
|
|
-----------------------------------------------------------
|
|
2 3 2^2-1 (0,1)
|
|
8 255 2^8-1 (0,1,2,3,4,5,6,7)
|
|
10 1023 2^10-1 (0,1,2,3,4,5,6,7,8,9)
|
|
16 65535 2^16-1 (0,1,2,3,4,5,6,7,8,9,a,b,c,d,e,f)
|
|
*/
|
|
if ( digits_seen < (1UL << ModuleInfo.radix) )
|
|
base = ModuleInfo.radix;
|
|
break;
|
|
}
|
|
|
|
#if MASMNUMBER
|
|
/* Masm doesn't swallow alphanum chars which may follow the
|
|
* number!
|
|
*/
|
|
if ( base != 0 ) {
|
|
#else
|
|
if ( base != 0 && is_valid_id_char( *ptr ) == FALSE ) {
|
|
#endif
|
|
buf->token = T_NUM;
|
|
buf->numbase = base;
|
|
buf->itemlen = dig_end - dig_start;
|
|
//DebugMsg(("get_number: inp=%s, value=%" I32_SPEC "X\n", p->input, buf->value64 ));
|
|
} else {
|
|
buf->token = T_BAD_NUM;
|
|
DebugMsg(("get_number: BAD_NUMBER (%s), radix=%u, base=%u, ptr=>%s<, digits_seen=%Xh\n", dig_start, ModuleInfo.radix, base, ptr, digits_seen ));
|
|
/* swallow remainder of token */
|
|
while( is_valid_id_char( *ptr ) ) ++ptr;
|
|
}
|
|
number_done:
|
|
len = ptr - p->input;
|
|
memcpy( p->output, p->input, len );
|
|
|
|
p->output += len;
|
|
*p->output++ = NULLC;
|
|
p->input = ptr;
|
|
|
|
return( NOT_ERROR );
|
|
}
|
|
|
|
#if BACKQUOTES
|
|
static ret_code get_id_in_backquotes( struct asm_tok *buf, struct line_status *p )
|
|
/********************************************************************************/
|
|
{
|
|
char *optr = p->output;
|
|
buf->token = T_ID;
|
|
buf->idarg = 0;
|
|
|
|
p->input++; /* strip off the backquotes */
|
|
for( ; *p->input != '`'; ) {
|
|
if( *p->input == NULLC || *p->input == ';' ) {
|
|
*p->output = NULLC;
|
|
EmitErr( BACKQUOTE_MISSING, p->output );
|
|
return( ERROR );
|
|
}
|
|
*optr++ = *p->input++;
|
|
}
|
|
p->input++; /* skip the terminating '`' */
|
|
*optr++ = NULLC;
|
|
p->output = optr;
|
|
return( NOT_ERROR );
|
|
}
|
|
#endif
|
|
|
|
/* get an ID. will always return NOT_ERROR. */
|
|
|
|
static ret_code get_id( struct asm_tok *buf, struct line_status *p )
|
|
/******************************************************************/
|
|
{
|
|
//struct ReservedWord *resw;
|
|
char *src = p->input;
|
|
char *dst = p->output;
|
|
int index;
|
|
unsigned size;
|
|
|
|
#if CONCATID || DOTNAMEX
|
|
continue_scan:
|
|
#endif
|
|
do {
|
|
*dst++ = *src++;
|
|
} while ( is_valid_id_char( *src ) );
|
|
#if CONCATID
|
|
/* v2.05: in case there's a backslash right behind
|
|
* the ID, check if a line concatenation is to occur.
|
|
* If yes, and the first char of the concatenated line
|
|
* is also a valid ID char, continue to scan the name.
|
|
* Problem: it's ok for EQU, but less good for other directives.
|
|
*/
|
|
if ( *src == '\\' ) {
|
|
if ( ConcatLine( src, src - p->input, dst, p ) != EMPTY ) {
|
|
p->concat = TRUE;
|
|
if ( is_valid_id_char( *src ) )
|
|
goto continue_scan;
|
|
}
|
|
}
|
|
#endif
|
|
#if DOTNAMEX
|
|
/* if the name starts with a dot or underscore, then accept dots
|
|
* within the name (though not as last char). OPTION DOTNAME
|
|
* must be on.
|
|
*/
|
|
if ( *src == '.' && ModuleInfo.dotname &&
|
|
( *(p->output) == '.' || *(p->output) == '_' ) &&
|
|
( is_valid_id_char(*(src+1)) || *(src+1) == '.' ) )
|
|
goto continue_scan;
|
|
#endif
|
|
/* v2.04: check added */
|
|
size = dst - p->output;
|
|
if ( size > MAX_ID_LEN ) {
|
|
EmitErr( IDENTIFIER_TOO_LONG );
|
|
dst = p->output + MAX_ID_LEN;
|
|
}
|
|
*dst++ = NULLC;
|
|
|
|
/* now decide what to do with it */
|
|
|
|
if( size == 1 && *p->output == '?' ) {
|
|
p->input = src;
|
|
buf->token = T_QUESTION_MARK;
|
|
buf->string_ptr = "?";
|
|
return( NOT_ERROR );
|
|
}
|
|
index = FindResWord( p->output, size );
|
|
if( index == 0 ) {
|
|
/* if ID begins with a DOT, check for OPTION DOTNAME.
|
|
* if not set, skip the token and return a T_DOT instead!
|
|
*/
|
|
if ( *p->output == '.' && ModuleInfo.dotname == FALSE ) {
|
|
buf->token = T_DOT;
|
|
buf->string_ptr = (char *)&stokstr1['.' - '('];
|
|
p->input++;
|
|
return( NOT_ERROR );
|
|
}
|
|
p->input = src;
|
|
p->output = dst;
|
|
buf->token = T_ID;
|
|
buf->idarg = 0;
|
|
return( NOT_ERROR );
|
|
}
|
|
p->input = src;
|
|
p->output = dst;
|
|
buf->tokval = index; /* is a enum instr_token value */
|
|
/* v2.11: RWF_SPECIAL now obsolete */
|
|
//if ( ! ( ResWordTable[index].flags & RWF_SPECIAL ) ) {
|
|
if ( index >= SPECIAL_LAST ) {
|
|
|
|
// DebugMsg(("found item >%s< in instruction table, rm=%X\n", buf->string_ptr, InstrTable[index].rm_byte));
|
|
|
|
/* if -Zm is set, the following from the Masm docs is relevant:
|
|
*
|
|
* Reserved Keywords Dependent on CPU Mode with OPTION M510
|
|
*
|
|
* With OPTION M510, keywords and instructions not available in the
|
|
* current CPU mode (such as ENTER under .8086) are not treated as
|
|
* keywords. This also means the USE32, FLAT, FAR32, and NEAR32 segment
|
|
* types and the 80386/486 registers are not keywords with a processor
|
|
* selection less than .386.
|
|
* If you remove OPTION M510, any reserved word used as an identifier
|
|
* generates a syntax error. You can either rename the identifiers or
|
|
* use OPTION NOKEYWORD. For more information on OPTION NOKEYWORD, see
|
|
* OPTION NOKEYWORD, later in this appendix.
|
|
*
|
|
* The current implementation of this rule below is likely to be improved.
|
|
*/
|
|
if ( ModuleInfo.m510 ) {
|
|
/* checking the cpu won't give the expected results currently since
|
|
* some instructions in the table (i.e. MOV) start with a 386 variant!
|
|
*/
|
|
index = IndexFromToken( buf->tokval );
|
|
#if 0 /* changed for v1.96 */
|
|
if (( InstrTable[index].cpu & P_EXT_MASK ) > ( ModuleInfo.curr_cpu & P_EXT_MASK )) {
|
|
#else
|
|
if (( InstrTable[index].cpu & P_CPU_MASK ) > ( ModuleInfo.curr_cpu & P_CPU_MASK ) ||
|
|
( InstrTable[index].cpu & P_EXT_MASK ) > ( ModuleInfo.curr_cpu & P_EXT_MASK )) {
|
|
#endif
|
|
buf->token = T_ID;
|
|
buf->idarg = 0;
|
|
return( NOT_ERROR );
|
|
}
|
|
}
|
|
buf->token = T_INSTRUCTION;
|
|
return( NOT_ERROR );
|
|
}
|
|
index = buf->tokval;
|
|
|
|
/* for RWT_SPECIAL, field <bytval> contains further infos:
|
|
- RWT_REG: register number (regnum)
|
|
- RWT_DIRECTIVE: type of directive (dirtype)
|
|
- RWT_UNARY_OPERATOR: operator precedence
|
|
- RWT_BINARY_OPERATOR: operator precedence
|
|
- RWT_STYPE: memtype
|
|
- RWT_RES_ID: for languages, LANG_xxx value
|
|
for the rest, unused.
|
|
*/
|
|
buf->bytval = SpecialTable[index].bytval;
|
|
|
|
switch ( SpecialTable[index].type ) {
|
|
case RWT_REG:
|
|
buf->token = T_REG;
|
|
break;
|
|
case RWT_DIRECTIVE:
|
|
buf->token = T_DIRECTIVE;
|
|
if ( p->flags2 == 0 )
|
|
p->flags2 = SpecialTable[index].value;
|
|
break;
|
|
case RWT_UNARY_OP: /* OFFSET, LOW, HIGH, LOWWORD, HIGHWORD, SHORT, ... */
|
|
buf->token = T_UNARY_OPERATOR;
|
|
break;
|
|
case RWT_BINARY_OP: /* GE, GT, LE, LT, EQ, NE, MOD, PTR */
|
|
buf->token = T_BINARY_OPERATOR;
|
|
break;
|
|
case RWT_STYPE: /* BYTE, WORD, FAR, NEAR, FAR16, NEAR32 ... */
|
|
buf->token = T_STYPE;
|
|
break;
|
|
case RWT_RES_ID: /* DUP, ADDR, FLAT, VARARG, language types [, FRAME (64-bit)] */
|
|
buf->token = T_RES_ID;
|
|
break;
|
|
default: /* shouldn't happen */
|
|
DebugMsg(("get_id: error, unknown type in SpecialTable[%u]=%u\n", index, SpecialTable[index].type ));
|
|
/**/myassert( 0 );
|
|
buf->token = T_ID;
|
|
buf->idarg = 0;
|
|
break;
|
|
}
|
|
return( NOT_ERROR );
|
|
}
|
|
|
|
/* get one token.
|
|
* possible return values: NOT_ERROR, ERROR, EMPTY.
|
|
*
|
|
* names beginning with '.' are difficult to detect,
|
|
* because the dot is a binary operator. The rules to
|
|
* accept a "dotted" name are:
|
|
* 1.- a valid ID char is to follow the dot
|
|
* 2.- if buffer index is > 0, then the previous item
|
|
* must not be a reg, ), ] or an ID.
|
|
* [bx.abc] -> . is an operator
|
|
* ([bx]).abc -> . is an operator
|
|
* [bx].abc -> . is an operator
|
|
* varname.abc -> . is an operator
|
|
*/
|
|
|
|
#define is_valid_id_start( ch ) ( isalpha(ch) || ch=='_' || ch=='@' || ch=='$' || ch=='?' )
|
|
|
|
ret_code GetToken( struct asm_tok token[], struct line_status *p )
|
|
/****************************************************************/
|
|
{
|
|
if( isdigit( *p->input ) ) {
|
|
return( get_number( token, p ) );
|
|
} else if( is_valid_id_start( *p->input ) ) {
|
|
return( get_id( token, p ) );
|
|
} else if( *p->input == '.' &&
|
|
#if DOTNAMEX /* allow dots within identifiers */
|
|
( is_valid_id_char(*(p->input+1)) || *(p->input+1) == '.' ) &&
|
|
#else
|
|
is_valid_id_char(*(p->input+1)) &&
|
|
#endif
|
|
/* v2.11: member last_token has been removed */
|
|
//( p->last_token != T_REG && p->last_token != T_CL_BRACKET && p->last_token != T_CL_SQ_BRACKET && p->last_token != T_ID ) ) {
|
|
( p->index == 0 || ( token[-1].token != T_REG && token[-1].token != T_CL_BRACKET && token[-1].token != T_CL_SQ_BRACKET && token[-1].token != T_ID ) ) ) {
|
|
return( get_id( token, p ) );
|
|
#if BACKQUOTES
|
|
} else if( *p->input == '`' && Options.strict_masm_compat == FALSE ) {
|
|
return( get_id_in_backquotes( token, p ) );
|
|
#endif
|
|
}
|
|
return( get_special_symbol( token, p ) );
|
|
}
|
|
|
|
// fixme char *IfSymbol; /* save symbols in IFDEF's so they don't get expanded */
|
|
|
|
static void StartComment( const char *p )
|
|
/***************************************/
|
|
{
|
|
while ( isspace( *p ) ) p++;
|
|
if ( *p == NULLC ) {
|
|
EmitError( COMMENT_DELIMITER_EXPECTED );
|
|
return;
|
|
}
|
|
ModuleInfo.inside_comment = *p++;
|
|
if( strchr( p, ModuleInfo.inside_comment ) )
|
|
ModuleInfo.inside_comment = NULLC;
|
|
return;
|
|
}
|
|
|
|
int Tokenize( char *line, unsigned int start, struct asm_tok tokenarray[], unsigned int flags )
|
|
/*********************************************************************************************/
|
|
/*
|
|
* create tokens from a source line.
|
|
* line: the line which is to be tokenized
|
|
* start: where to start in the token buffer. If start == 0,
|
|
* then some variables are additionally initialized.
|
|
* flags: 1=if the line has been tokenized already.
|
|
*/
|
|
{
|
|
int rc;
|
|
struct line_status p;
|
|
|
|
p.input = line;
|
|
p.start = line;
|
|
p.index = start;
|
|
//p.last_token = T_FINAL; /* v2.11: last_token is obsolete */
|
|
p.flags = flags;
|
|
p.flags2 = 0;
|
|
p.flags3 = 0;
|
|
if ( p.index == 0 ) {
|
|
#ifdef DEBUG_OUT
|
|
cnttok0++;
|
|
#endif
|
|
/* v2.06: these flags are now initialized on a higher level */
|
|
//ModuleInfo.line_flags = 0;
|
|
p.output = token_stringbuf;
|
|
if( ModuleInfo.inside_comment ) {
|
|
DebugMsg1(("COMMENT active, delim is >%c<, line is >%s<\n", ModuleInfo.inside_comment, line));
|
|
if( strchr( line, ModuleInfo.inside_comment ) != NULL ) {
|
|
DebugMsg1(("COMMENT mode exited\n"));
|
|
ModuleInfo.inside_comment = NULLC;
|
|
}
|
|
goto skipline;
|
|
}
|
|
/* v2.08: expansion operator % at pos 0 is handled differently.
|
|
*/
|
|
//while( isspace( *p.input )) p.input++;
|
|
//if ( *p.input == '%' ) {
|
|
// *p.input++ = ' ';
|
|
// expansion = TRUE;
|
|
//}
|
|
} else {
|
|
#ifdef DEBUG_OUT
|
|
cnttok1++;
|
|
#endif
|
|
p.output = StringBufferEnd;
|
|
}
|
|
|
|
for( ;; ) {
|
|
|
|
while( isspace( *p.input ) ) p.input++;
|
|
|
|
if ( *p.input == ';' && flags == TOK_DEFAULT ) {
|
|
while ( p.input > line && isspace( *(p.input-1) ) ) p.input--; /* skip */
|
|
strcpy( commentbuffer, p.input );
|
|
ModuleInfo.CurrComment = commentbuffer;
|
|
*p.input = NULLC;
|
|
}
|
|
|
|
tokenarray[p.index].tokpos = p.input;
|
|
|
|
if( *p.input == NULLC ) {
|
|
/* if a comma is last token, concat lines ... with some exceptions
|
|
* v2.05: moved from PreprocessLine(). Moved because the
|
|
* concatenation may be triggered by a comma AFTER expansion.
|
|
*/
|
|
if ( p.index > 1 &&
|
|
tokenarray[p.index-1].token == T_COMMA
|
|
#if FASTPASS
|
|
&& ( Parse_Pass == PASS_1 || UseSavedState == FALSE ) /* is it an already preprocessed line? */
|
|
#endif
|
|
&& start == 0 ) {
|
|
DebugMsg1(("Tokenize: calling IsMultiLine()\n" ));
|
|
if ( IsMultiLine( tokenarray ) ) {
|
|
char *ptr = GetAlignedPointer( p.output, strlen( p.output ) );
|
|
DebugMsg1(("Tokenize: IsMultiLine(%s)=TRUE\n", line ));
|
|
if ( GetTextLine( ptr ) ) {
|
|
while ( isspace( *ptr ) ) ptr++;
|
|
if ( *ptr ) {
|
|
strcpy( p.input, ptr );
|
|
if ( strlen( p.start ) >= MAX_LINE_LEN ) {
|
|
EmitError( LINE_TOO_LONG );
|
|
p.index = start;
|
|
break;
|
|
}
|
|
DebugMsg1(("Tokenize: line concatenation, line=%s\n", line ));
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
tokenarray[p.index].string_ptr = p.output;
|
|
rc = GetToken( &tokenarray[p.index], &p );
|
|
if ( rc == EMPTY )
|
|
continue;
|
|
if ( rc == ERROR ) {
|
|
p.index = start; /* skip this line */
|
|
break;
|
|
}
|
|
/* v2.04: this has been moved here from condasm.c to
|
|
* avoid problems with (conditional) listings. It also
|
|
* avoids having to search for the first token twice.
|
|
* Note: a conditional assembly directive within an
|
|
* inactive block and preceded by a label isn't detected!
|
|
* This is an exact copy of the Masm behavior, although
|
|
* it probably is just a bug!
|
|
*/
|
|
if ( !(flags & TOK_RESCAN) ) {
|
|
if ( p.index == 0 || ( p.index == 2 && ( tokenarray[1].token == T_COLON || tokenarray[1].token == T_DBL_COLON) ) ) {
|
|
if ( tokenarray[p.index].token == T_DIRECTIVE &&
|
|
tokenarray[p.index].bytval == DRT_CONDDIR ) {
|
|
if ( tokenarray[p.index].tokval == T_COMMENT ) {
|
|
DebugMsg1(("tokenize: COMMENT starting, delim is >%c<\n", ModuleInfo.inside_comment));
|
|
StartComment( p.input );
|
|
break; /* p.index is 0 or 2 */
|
|
}
|
|
conditional_assembly_prepare( tokenarray[p.index].tokval );
|
|
if ( CurrIfState != BLOCK_ACTIVE ) {
|
|
p.index++;
|
|
break; /* p.index is 1 or 3 */
|
|
}
|
|
} else if( CurrIfState != BLOCK_ACTIVE ) {
|
|
/* further processing skipped. p.index is 0 */
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
//p.last_token = tokenarray[p.index].token; /* v2.11: last_token is obsolete */
|
|
p.index++;
|
|
if( p.index >= MAX_TOKEN ) {
|
|
DebugMsg1(("tokenize: token index %u >= MAX_TOKEN (=%u), line=>%s<\n", p.index, MAX_TOKEN, line ));
|
|
EmitError( TOO_MANY_TOKENS );
|
|
p.index = start;
|
|
goto skipline;
|
|
}
|
|
|
|
#if TOKSTRALIGN
|
|
p.output = GetAlignedPointer( token_stringbuf, p.output - token_stringbuf );
|
|
#endif
|
|
|
|
}
|
|
|
|
#if TOKSTRALIGN
|
|
p.output = GetAlignedPointer( token_stringbuf, p.output - token_stringbuf );
|
|
#endif
|
|
StringBufferEnd = p.output;
|
|
skipline:
|
|
tokenarray[p.index].token = T_FINAL;
|
|
tokenarray[p.index].bytval = p.flags3;
|
|
tokenarray[p.index].string_ptr = "";
|
|
return( p.index );
|
|
}
|
|
|