diff options
author | axc <axc@yandex-team.ru> | 2022-02-10 16:47:35 +0300 |
---|---|---|
committer | Daniil Cherednik <dcherednik@yandex-team.ru> | 2022-02-10 16:47:35 +0300 |
commit | 1f5217043ad70f25dc35e75b3bd261a1e23d045e (patch) | |
tree | 11bf68c1fa5272d3d3446cbd5a0ff96ed9d75788 /contrib/tools/bison/m4/src/input.c | |
parent | 69505a07cbb096113e85aa02e7d136cac4aa826c (diff) | |
download | ydb-1f5217043ad70f25dc35e75b3bd261a1e23d045e.tar.gz |
Restoring authorship annotation for <axc@yandex-team.ru>. Commit 1 of 2.
Diffstat (limited to 'contrib/tools/bison/m4/src/input.c')
-rw-r--r-- | contrib/tools/bison/m4/src/input.c | 2310 |
1 files changed, 1155 insertions, 1155 deletions
diff --git a/contrib/tools/bison/m4/src/input.c b/contrib/tools/bison/m4/src/input.c index 836d706489..5c8d3c835f 100644 --- a/contrib/tools/bison/m4/src/input.c +++ b/contrib/tools/bison/m4/src/input.c @@ -1,1156 +1,1156 @@ -/* GNU m4 -- A simple macro processor - - Copyright (C) 1989-1994, 2004-2013 Free Software Foundation, Inc. - - This file is part of GNU M4. - - GNU M4 is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - GNU M4 is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -/* Handling of different input sources, and lexical analysis. */ - -#include "m4.h" - -#include "memchr2.h" - -/* Unread input can be either files, that should be read (eg. included - files), strings, which should be rescanned (eg. macro expansion text), - or quoted macro definitions (as returned by the builtin "defn"). - Unread input are organised in a stack, implemented with an obstack. - Each input source is described by a "struct input_block". The obstack - is "current_input". The top of the input stack is "isp". - - The macro "m4wrap" places the text to be saved on another input - stack, on the obstack "wrapup_stack", whose top is "wsp". When EOF - is seen on normal input (eg, when "current_input" is empty), input is - switched over to "wrapup_stack", and the original "current_input" is - freed. A new stack is allocated for "wrapup_stack", which will - accept any text produced by calls to "m4wrap" from within the - wrapped text. This process of shuffling "wrapup_stack" to - "current_input" can continue indefinitely, even generating infinite - loops (e.g. "define(`f',`m4wrap(`f')')f"), without memory leaks. - - Pushing new input on the input stack is done by push_file (), - push_string (), push_wrapup () (for wrapup text), and push_macro () - (for macro definitions). Because macro expansion needs direct access - to the current input obstack (for optimisation), push_string () are - split in two functions, push_string_init (), which returns a pointer - to the current input stack, and push_string_finish (), which return a - pointer to the final text. The input_block *next is used to manage - the coordination between the different push routines. - - The current file and line number are stored in two global - variables, for use by the error handling functions in m4.c. Macro - expansion wants to report the line where a macro name was detected, - rather than where it finished collecting arguments. This also - applies to text resulting from macro expansions. So each input - block maintains its own notion of the current file and line, and - swapping between input blocks updates the global variables - accordingly. */ - -#ifdef ENABLE_CHANGEWORD +/* GNU m4 -- A simple macro processor + + Copyright (C) 1989-1994, 2004-2013 Free Software Foundation, Inc. + + This file is part of GNU M4. + + GNU M4 is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + GNU M4 is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* Handling of different input sources, and lexical analysis. */ + +#include "m4.h" + +#include "memchr2.h" + +/* Unread input can be either files, that should be read (eg. included + files), strings, which should be rescanned (eg. macro expansion text), + or quoted macro definitions (as returned by the builtin "defn"). + Unread input are organised in a stack, implemented with an obstack. + Each input source is described by a "struct input_block". The obstack + is "current_input". The top of the input stack is "isp". + + The macro "m4wrap" places the text to be saved on another input + stack, on the obstack "wrapup_stack", whose top is "wsp". When EOF + is seen on normal input (eg, when "current_input" is empty), input is + switched over to "wrapup_stack", and the original "current_input" is + freed. A new stack is allocated for "wrapup_stack", which will + accept any text produced by calls to "m4wrap" from within the + wrapped text. This process of shuffling "wrapup_stack" to + "current_input" can continue indefinitely, even generating infinite + loops (e.g. "define(`f',`m4wrap(`f')')f"), without memory leaks. + + Pushing new input on the input stack is done by push_file (), + push_string (), push_wrapup () (for wrapup text), and push_macro () + (for macro definitions). Because macro expansion needs direct access + to the current input obstack (for optimisation), push_string () are + split in two functions, push_string_init (), which returns a pointer + to the current input stack, and push_string_finish (), which return a + pointer to the final text. The input_block *next is used to manage + the coordination between the different push routines. + + The current file and line number are stored in two global + variables, for use by the error handling functions in m4.c. Macro + expansion wants to report the line where a macro name was detected, + rather than where it finished collecting arguments. This also + applies to text resulting from macro expansions. So each input + block maintains its own notion of the current file and line, and + swapping between input blocks updates the global variables + accordingly. */ + +#ifdef ENABLE_CHANGEWORD #include <contrib/tools/bison/gnulib/src/regex.h> -#endif - -enum input_type -{ - INPUT_STRING, /* String resulting from macro expansion. */ - INPUT_FILE, /* File from command line or include. */ - INPUT_MACRO /* Builtin resulting from defn. */ -}; - -typedef enum input_type input_type; - -struct input_block -{ - struct input_block *prev; /* previous input_block on the input stack */ - input_type type; /* see enum values */ - const char *file; /* file where this input is from */ - int line; /* line where this input is from */ - union - { - struct - { - char *string; /* remaining string value */ - char *end; /* terminating NUL of string */ - } - u_s; /* INPUT_STRING */ - struct - { - FILE *fp; /* input file handle */ - bool_bitfield end : 1; /* true if peek has seen EOF */ - bool_bitfield close : 1; /* true if we should close file on pop */ - bool_bitfield advance : 1; /* track previous start_of_input_line */ - } - u_f; /* INPUT_FILE */ - builtin_func *func; /* pointer to macro's function */ - } - u; -}; - -typedef struct input_block input_block; - - -/* Current input file name. */ -const char *current_file; - -/* Current input line number. */ -int current_line; - -/* Obstack for storing individual tokens. */ -static struct obstack token_stack; - -/* Obstack for storing file names. */ -static struct obstack file_names; - -/* Wrapup input stack. */ -static struct obstack *wrapup_stack; - -/* Current stack, from input or wrapup. */ -static struct obstack *current_input; - -/* Bottom of token_stack, for obstack_free. */ -static void *token_bottom; - -/* Pointer to top of current_input. */ -static input_block *isp; - -/* Pointer to top of wrapup_stack. */ -static input_block *wsp; - -/* Aux. for handling split push_string (). */ -static input_block *next; - -/* Flag for next_char () to increment current_line. */ -static bool start_of_input_line; - -/* Flag for next_char () to recognize change in input block. */ -static bool input_change; - -#define CHAR_EOF 256 /* character return on EOF */ -#define CHAR_MACRO 257 /* character return for MACRO token */ - -/* Quote chars. */ -STRING rquote; -STRING lquote; - -/* Comment chars. */ -STRING bcomm; -STRING ecomm; - -#ifdef ENABLE_CHANGEWORD - -# define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*" - -static struct re_pattern_buffer word_regexp; -static int default_word_regexp; -static struct re_registers regs; - -#else /* ! ENABLE_CHANGEWORD */ -# define default_word_regexp 1 -#endif /* ! ENABLE_CHANGEWORD */ - -#ifdef DEBUG_INPUT -static const char *token_type_string (token_type); -#endif - - -/*-------------------------------------------------------------------. -| push_file () pushes an input file on the input stack, saving the | -| current file name and line number. If next is non-NULL, this push | -| invalidates a call to push_string_init (), whose storage is | -| consequently released. If CLOSE_WHEN_DONE, then close FP after | -| EOF is detected. | -`-------------------------------------------------------------------*/ - -void -push_file (FILE *fp, const char *title, bool close_when_done) -{ - input_block *i; - - if (next != NULL) - { - obstack_free (current_input, next); - next = NULL; - } - - if (debug_level & DEBUG_TRACE_INPUT) - DEBUG_MESSAGE1 ("input read from %s", title); - - i = (input_block *) obstack_alloc (current_input, - sizeof (struct input_block)); - i->type = INPUT_FILE; - i->file = (char *) obstack_copy0 (&file_names, title, strlen (title)); - i->line = 1; - input_change = true; - - i->u.u_f.fp = fp; - i->u.u_f.end = false; - i->u.u_f.close = close_when_done; - i->u.u_f.advance = start_of_input_line; - output_current_line = -1; - - i->prev = isp; - isp = i; -} - -/*---------------------------------------------------------------. -| push_macro () pushes a builtin macro's definition on the input | -| stack. If next is non-NULL, this push invalidates a call to | -| push_string_init (), whose storage is consequently released. | -`---------------------------------------------------------------*/ - -void -push_macro (builtin_func *func) -{ - input_block *i; - - if (next != NULL) - { - obstack_free (current_input, next); - next = NULL; - } - - i = (input_block *) obstack_alloc (current_input, - sizeof (struct input_block)); - i->type = INPUT_MACRO; - i->file = current_file; - i->line = current_line; - input_change = true; - - i->u.func = func; - i->prev = isp; - isp = i; -} - -/*------------------------------------------------------------------. -| First half of push_string (). The pointer next points to the new | -| input_block. | -`------------------------------------------------------------------*/ - -struct obstack * -push_string_init (void) -{ - if (next != NULL) - { - M4ERROR ((warning_status, 0, - "INTERNAL ERROR: recursive push_string!")); - abort (); - } - - next = (input_block *) obstack_alloc (current_input, - sizeof (struct input_block)); - next->type = INPUT_STRING; - next->file = current_file; - next->line = current_line; - - return current_input; -} - -/*-------------------------------------------------------------------. -| Last half of push_string (). If next is now NULL, a call to | -| push_file () has invalidated the previous call to push_string_init | -| (), so we just give up. If the new object is void, we do not push | -| it. The function push_string_finish () returns a pointer to the | -| finished object. This pointer is only for temporary use, since | -| reading the next token might release the memory used for the | -| object. | -`-------------------------------------------------------------------*/ - -const char * -push_string_finish (void) -{ - const char *ret = NULL; - - if (next == NULL) - return NULL; - - if (obstack_object_size (current_input) > 0) - { - size_t len = obstack_object_size (current_input); - obstack_1grow (current_input, '\0'); - next->u.u_s.string = (char *) obstack_finish (current_input); - next->u.u_s.end = next->u.u_s.string + len; - next->prev = isp; - isp = next; - ret = isp->u.u_s.string; /* for immediate use only */ - input_change = true; - } - else - obstack_free (current_input, next); /* people might leave garbage on it. */ - next = NULL; - return ret; -} - -/*------------------------------------------------------------------. -| The function push_wrapup () pushes a string on the wrapup stack. | -| When the normal input stack gets empty, the wrapup stack will | -| become the input stack, and push_string () and push_file () will | -| operate on wrapup_stack. Push_wrapup should be done as | -| push_string (), but this will suffice, as long as arguments to | -| m4_m4wrap () are moderate in size. | -`------------------------------------------------------------------*/ - -void -push_wrapup (const char *s) -{ - size_t len = strlen (s); - input_block *i; - i = (input_block *) obstack_alloc (wrapup_stack, - sizeof (struct input_block)); - i->prev = wsp; - i->type = INPUT_STRING; - i->file = current_file; - i->line = current_line; - i->u.u_s.string = (char *) obstack_copy0 (wrapup_stack, s, len); - i->u.u_s.end = i->u.u_s.string + len; - wsp = i; -} - - -/*-------------------------------------------------------------------. -| The function pop_input () pops one level of input sources. If the | -| popped input_block is a file, current_file and current_line are | -| reset to the saved values before the memory for the input_block is | -| released. | -`-------------------------------------------------------------------*/ - -static void -pop_input (void) -{ - input_block *tmp = isp->prev; - - switch (isp->type) - { - case INPUT_STRING: - case INPUT_MACRO: - break; - - case INPUT_FILE: - if (debug_level & DEBUG_TRACE_INPUT) - { - if (tmp) - DEBUG_MESSAGE2 ("input reverted to %s, line %d", - tmp->file, tmp->line); - else - DEBUG_MESSAGE ("input exhausted"); - } - - if (ferror (isp->u.u_f.fp)) - { - M4ERROR ((warning_status, 0, "read error")); - if (isp->u.u_f.close) - fclose (isp->u.u_f.fp); - retcode = EXIT_FAILURE; - } - else if (isp->u.u_f.close && fclose (isp->u.u_f.fp) == EOF) - { - M4ERROR ((warning_status, errno, "error reading file")); - retcode = EXIT_FAILURE; - } - start_of_input_line = isp->u.u_f.advance; - output_current_line = -1; - break; - - default: - M4ERROR ((warning_status, 0, - "INTERNAL ERROR: input stack botch in pop_input ()")); - abort (); - } - obstack_free (current_input, isp); - next = NULL; /* might be set in push_string_init () */ - - isp = tmp; - input_change = true; -} - -/*-------------------------------------------------------------------. -| To switch input over to the wrapup stack, main calls pop_wrapup | -| (). Since wrapup text can install new wrapup text, pop_wrapup () | -| returns false when there is no wrapup text on the stack, and true | -| otherwise. | -`-------------------------------------------------------------------*/ - -bool -pop_wrapup (void) -{ - next = NULL; - obstack_free (current_input, NULL); - free (current_input); - - if (wsp == NULL) - { - /* End of the program. Free all memory even though we are about - to exit, since it makes leak detection easier. */ - obstack_free (&token_stack, NULL); - obstack_free (&file_names, NULL); - obstack_free (wrapup_stack, NULL); - free (wrapup_stack); -#ifdef ENABLE_CHANGEWORD - regfree (&word_regexp); -#endif /* ENABLE_CHANGEWORD */ - return false; - } - - current_input = wrapup_stack; - wrapup_stack = (struct obstack *) xmalloc (sizeof (struct obstack)); - obstack_init (wrapup_stack); - - isp = wsp; - wsp = NULL; - input_change = true; - - return true; -} - -/*-------------------------------------------------------------------. -| When a MACRO token is seen, next_token () uses init_macro_token () | -| to retrieve the value of the function pointer. | -`-------------------------------------------------------------------*/ - -static void -init_macro_token (token_data *td) -{ - if (isp->type != INPUT_MACRO) - { - M4ERROR ((warning_status, 0, - "INTERNAL ERROR: bad call to init_macro_token ()")); - abort (); - } - - TOKEN_DATA_TYPE (td) = TOKEN_FUNC; - TOKEN_DATA_FUNC (td) = isp->u.func; -} - - -/*-----------------------------------------------------------------. -| Low level input is done a character at a time. The function | -| peek_input () is used to look at the next character in the input | -| stream. At any given time, it reads from the input_block on the | -| top of the current input stack. | -`-----------------------------------------------------------------*/ - -static int -peek_input (void) -{ - int ch; - input_block *block = isp; - - while (1) - { - if (block == NULL) - return CHAR_EOF; - - switch (block->type) - { - case INPUT_STRING: - ch = to_uchar (block->u.u_s.string[0]); - if (ch != '\0') - return ch; - break; - - case INPUT_FILE: - ch = getc (block->u.u_f.fp); - if (ch != EOF) - { - ungetc (ch, block->u.u_f.fp); - return ch; - } - block->u.u_f.end = true; - break; - - case INPUT_MACRO: - return CHAR_MACRO; - - default: - M4ERROR ((warning_status, 0, - "INTERNAL ERROR: input stack botch in peek_input ()")); - abort (); - } - block = block->prev; - } -} - -/*-------------------------------------------------------------------. -| The function next_char () is used to read and advance the input to | -| the next character. It also manages line numbers for error | -| messages, so they do not get wrong, due to lookahead. The token | -| consisting of a newline alone is taken as belonging to the line it | -| ends, and the current line number is not incremented until the | -| next character is read. 99.9% of all calls will read from a | -| string, so factor that out into a macro for speed. | -`-------------------------------------------------------------------*/ - -#define next_char() \ - (isp && isp->type == INPUT_STRING && isp->u.u_s.string[0] \ - && !input_change \ - ? to_uchar (*isp->u.u_s.string++) \ - : next_char_1 ()) - -static int -next_char_1 (void) -{ - int ch; - - while (1) - { - if (isp == NULL) - { - current_file = ""; - current_line = 0; - return CHAR_EOF; - } - - if (input_change) - { - current_file = isp->file; - current_line = isp->line; - input_change = false; - } - - switch (isp->type) - { - case INPUT_STRING: - ch = to_uchar (*isp->u.u_s.string++); - if (ch != '\0') - return ch; - break; - - case INPUT_FILE: - if (start_of_input_line) - { - start_of_input_line = false; - current_line = ++isp->line; - } - - /* If stdin is a terminal, calling getc after peek_input - already called it would make the user have to hit ^D - twice to quit. */ - ch = isp->u.u_f.end ? EOF : getc (isp->u.u_f.fp); - if (ch != EOF) - { - if (ch == '\n') - start_of_input_line = true; - return ch; - } - break; - - case INPUT_MACRO: - pop_input (); /* INPUT_MACRO input sources has only one token */ - return CHAR_MACRO; - - default: - M4ERROR ((warning_status, 0, - "INTERNAL ERROR: input stack botch in next_char ()")); - abort (); - } - - /* End of input source --- pop one level. */ - pop_input (); - } -} - -/*-------------------------------------------------------------------. -| skip_line () simply discards all immediately following characters, | -| upto the first newline. It is only used from m4_dnl (). | -`-------------------------------------------------------------------*/ - -void -skip_line (void) -{ - int ch; - const char *file = current_file; - int line = current_line; - - while ((ch = next_char ()) != CHAR_EOF && ch != '\n') - ; - if (ch == CHAR_EOF) - /* current_file changed to "" if we see CHAR_EOF, use the - previous value we stored earlier. */ - M4ERROR_AT_LINE ((warning_status, 0, file, line, - "Warning: end of file treated as newline")); - /* On the rare occasion that dnl crosses include file boundaries - (either the input file did not end in a newline, or changeword - was used), calling next_char can update current_file and - current_line, and that update will be undone as we return to - expand_macro. This informs next_char to fix things again. */ - if (file != current_file || line != current_line) - input_change = true; -} - - -/*------------------------------------------------------------------. -| This function is for matching a string against a prefix of the | -| input stream. If the string matches the input and consume is | -| true, the input is discarded; otherwise any characters read are | -| pushed back again. The function is used only when multicharacter | -| quotes or comment delimiters are used. | -`------------------------------------------------------------------*/ - -static bool -match_input (const char *s, bool consume) -{ - int n; /* number of characters matched */ - int ch; /* input character */ - const char *t; - bool result = false; - - ch = peek_input (); - if (ch != to_uchar (*s)) - return false; /* fail */ - - if (s[1] == '\0') - { - if (consume) - next_char (); - return true; /* short match */ - } - - next_char (); - for (n = 1, t = s++; peek_input () == to_uchar (*s++); ) - { - next_char (); - n++; - if (*s == '\0') /* long match */ - { - if (consume) - return true; - result = true; - break; - } - } - - /* Failed or shouldn't consume, push back input. */ - { - struct obstack *h = push_string_init (); - - /* `obstack_grow' may be macro evaluating its arg 1 several times. */ - obstack_grow (h, t, n); - } - push_string_finish (); - return result; -} - -/*--------------------------------------------------------------------. -| The macro MATCH() is used to match a string S against the input. | -| The first character is handled inline, for speed. Hopefully, this | -| will not hurt efficiency too much when single character quotes and | -| comment delimiters are used. If CONSUME, then CH is the result of | -| next_char, and a successful match will discard the matched string. | -| Otherwise, CH is the result of peek_char, and the input stream is | -| effectively unchanged. | -`--------------------------------------------------------------------*/ - -#define MATCH(ch, s, consume) \ - (to_uchar ((s)[0]) == (ch) \ - && (ch) != '\0' \ - && ((s)[1] == '\0' || (match_input ((s) + (consume), consume)))) - - -/*--------------------------------------------------------. -| Initialize input stacks, and quote/comment characters. | -`--------------------------------------------------------*/ - -void -input_init (void) -{ - current_file = ""; - current_line = 0; - - current_input = (struct obstack *) xmalloc (sizeof (struct obstack)); - obstack_init (current_input); - wrapup_stack = (struct obstack *) xmalloc (sizeof (struct obstack)); - obstack_init (wrapup_stack); - - obstack_init (&file_names); - - /* Allocate an object in the current chunk, so that obstack_free - will always work even if the first token parsed spills to a new - chunk. */ - obstack_init (&token_stack); - obstack_alloc (&token_stack, 1); - token_bottom = obstack_base (&token_stack); - - isp = NULL; - wsp = NULL; - next = NULL; - - start_of_input_line = false; - - lquote.string = xstrdup (DEF_LQUOTE); - lquote.length = strlen (lquote.string); - rquote.string = xstrdup (DEF_RQUOTE); - rquote.length = strlen (rquote.string); - bcomm.string = xstrdup (DEF_BCOMM); - bcomm.length = strlen (bcomm.string); - ecomm.string = xstrdup (DEF_ECOMM); - ecomm.length = strlen (ecomm.string); - -#ifdef ENABLE_CHANGEWORD - set_word_regexp (user_word_regexp); -#endif -} - - -/*------------------------------------------------------------------. -| Functions for setting quotes and comment delimiters. Used by | -| m4_changecom () and m4_changequote (). Pass NULL if the argument | -| was not present, to distinguish from an explicit empty string. | -`------------------------------------------------------------------*/ - -void -set_quotes (const char *lq, const char *rq) -{ - free (lquote.string); - free (rquote.string); - - /* POSIX states that with 0 arguments, the default quotes are used. - POSIX XCU ERN 112 states that behavior is implementation-defined - if there was only one argument, or if there is an empty string in - either position when there are two arguments. We allow an empty - left quote to disable quoting, but a non-empty left quote will - always create a non-empty right quote. See the texinfo for what - some other implementations do. */ - if (!lq) - { - lq = DEF_LQUOTE; - rq = DEF_RQUOTE; - } - else if (!rq || (*lq && !*rq)) - rq = DEF_RQUOTE; - - lquote.string = xstrdup (lq); - lquote.length = strlen (lquote.string); - rquote.string = xstrdup (rq); - rquote.length = strlen (rquote.string); -} - -void -set_comment (const char *bc, const char *ec) -{ - free (bcomm.string); - free (ecomm.string); - - /* POSIX requires no arguments to disable comments. It requires - empty arguments to be used as-is, but this is counter to - traditional behavior, because a non-null begin and null end makes - it impossible to end a comment. An aardvark has been filed: - http://www.opengroup.org/austin/mailarchives/ag-review/msg02168.html - This implementation assumes the aardvark will be approved. See - the texinfo for what some other implementations do. */ - if (!bc) - bc = ec = ""; - else if (!ec || (*bc && !*ec)) - ec = DEF_ECOMM; - - bcomm.string = xstrdup (bc); - bcomm.length = strlen (bcomm.string); - ecomm.string = xstrdup (ec); - ecomm.length = strlen (ecomm.string); -} - -#ifdef ENABLE_CHANGEWORD - -void -set_word_regexp (const char *regexp) -{ - const char *msg; - struct re_pattern_buffer new_word_regexp; - - if (!*regexp || STREQ (regexp, DEFAULT_WORD_REGEXP)) - { - default_word_regexp = true; - return; - } - - /* Dry run to see whether the new expression is compilable. */ - init_pattern_buffer (&new_word_regexp, NULL); - msg = re_compile_pattern (regexp, strlen (regexp), &new_word_regexp); - regfree (&new_word_regexp); - - if (msg != NULL) - { - M4ERROR ((warning_status, 0, - "bad regular expression `%s': %s", regexp, msg)); - return; - } - - /* If compilation worked, retry using the word_regexp struct. We - can't rely on struct assigns working, so redo the compilation. - The fastmap can be reused between compilations, and will be freed - by the final regfree. */ - if (!word_regexp.fastmap) - word_regexp.fastmap = xcharalloc (UCHAR_MAX + 1); - msg = re_compile_pattern (regexp, strlen (regexp), &word_regexp); - assert (!msg); - re_set_registers (&word_regexp, ®s, regs.num_regs, regs.start, regs.end); - if (re_compile_fastmap (&word_regexp)) - assert (false); - - default_word_regexp = false; -} - -#endif /* ENABLE_CHANGEWORD */ - - -/*--------------------------------------------------------------------. -| Parse and return a single token from the input stream. A token | -| can either be TOKEN_EOF, if the input_stack is empty; it can be | -| TOKEN_STRING for a quoted string; TOKEN_WORD for something that is | -| a potential macro name; and TOKEN_SIMPLE for any single character | -| that is not a part of any of the previous types. If LINE is not | -| NULL, set *LINE to the line where the token starts. | -| | -| Next_token () return the token type, and passes back a pointer to | -| the token data through TD. The token text is collected on the | -| obstack token_stack, which never contains more than one token text | -| at a time. The storage pointed to by the fields in TD is | -| therefore subject to change the next time next_token () is called. | -`--------------------------------------------------------------------*/ - -token_type -next_token (token_data *td, int *line) -{ - int ch; - int quote_level; - token_type type; -#ifdef ENABLE_CHANGEWORD - int startpos; - char *orig_text = NULL; -#endif - const char *file; - int dummy; - - obstack_free (&token_stack, token_bottom); - if (!line) - line = &dummy; - - /* Can't consume character until after CHAR_MACRO is handled. */ - ch = peek_input (); - if (ch == CHAR_EOF) - { -#ifdef DEBUG_INPUT - xfprintf (stderr, "next_token -> EOF\n"); -#endif - next_char (); - return TOKEN_EOF; - } - if (ch == CHAR_MACRO) - { - init_macro_token (td); - next_char (); -#ifdef DEBUG_INPUT - xfprintf (stderr, "next_token -> MACDEF (%s)\n", - find_builtin_by_addr (TOKEN_DATA_FUNC (td))->name); -#endif - return TOKEN_MACDEF; - } - - next_char (); /* Consume character we already peeked at. */ - file = current_file; - *line = current_line; - if (MATCH (ch, bcomm.string, true)) - { - obstack_grow (&token_stack, bcomm.string, bcomm.length); - while ((ch = next_char ()) != CHAR_EOF - && !MATCH (ch, ecomm.string, true)) - obstack_1grow (&token_stack, ch); - if (ch != CHAR_EOF) - obstack_grow (&token_stack, ecomm.string, ecomm.length); - else - /* current_file changed to "" if we see CHAR_EOF, use the - previous value we stored earlier. */ - M4ERROR_AT_LINE ((EXIT_FAILURE, 0, file, *line, - "ERROR: end of file in comment")); - - type = TOKEN_STRING; - } - else if (default_word_regexp && (isalpha (ch) || ch == '_')) - { - obstack_1grow (&token_stack, ch); - while ((ch = peek_input ()) != CHAR_EOF && (isalnum (ch) || ch == '_')) - { - obstack_1grow (&token_stack, ch); - next_char (); - } - type = TOKEN_WORD; - } - -#ifdef ENABLE_CHANGEWORD - - else if (!default_word_regexp && word_regexp.fastmap[ch]) - { - obstack_1grow (&token_stack, ch); - while (1) - { - ch = peek_input (); - if (ch == CHAR_EOF) - break; - obstack_1grow (&token_stack, ch); - startpos = re_search (&word_regexp, - (char *) obstack_base (&token_stack), - obstack_object_size (&token_stack), 0, 0, - ®s); - if (startpos || - regs.end [0] != (regoff_t) obstack_object_size (&token_stack)) - { - *(((char *) obstack_base (&token_stack) - + obstack_object_size (&token_stack)) - 1) = '\0'; - break; - } - next_char (); - } - - obstack_1grow (&token_stack, '\0'); - orig_text = (char *) obstack_finish (&token_stack); - - if (regs.start[1] != -1) - obstack_grow (&token_stack,orig_text + regs.start[1], - regs.end[1] - regs.start[1]); - else - obstack_grow (&token_stack, orig_text,regs.end[0]); - - type = TOKEN_WORD; - } - -#endif /* ENABLE_CHANGEWORD */ - - else if (!MATCH (ch, lquote.string, true)) - { - switch (ch) - { - case '(': - type = TOKEN_OPEN; - break; - case ',': - type = TOKEN_COMMA; - break; - case ')': - type = TOKEN_CLOSE; - break; - default: - type = TOKEN_SIMPLE; - break; - } - obstack_1grow (&token_stack, ch); - } - else - { - bool fast = lquote.length == 1 && rquote.length == 1; - quote_level = 1; - while (1) - { - /* Try scanning a buffer first. */ - const char *buffer = (isp && isp->type == INPUT_STRING - ? isp->u.u_s.string : NULL); - if (buffer && *buffer) - { - size_t len = isp->u.u_s.end - buffer; - const char *p = buffer; - do - { - p = (char *) memchr2 (p, *lquote.string, *rquote.string, - buffer + len - p); - } - while (p && fast && (*p++ == *rquote.string - ? --quote_level : ++quote_level)); - if (p) - { - if (fast) - { - assert (!quote_level); - obstack_grow (&token_stack, buffer, p - buffer - 1); - isp->u.u_s.string += p - buffer; - break; - } - obstack_grow (&token_stack, buffer, p - buffer); - ch = to_uchar (*p); - isp->u.u_s.string += p - buffer + 1; - } - else - { - obstack_grow (&token_stack, buffer, len); - isp->u.u_s.string += len; - continue; - } - } - /* Fall back to a byte. */ - else - ch = next_char (); - if (ch == CHAR_EOF) - /* current_file changed to "" if we see CHAR_EOF, use - the previous value we stored earlier. */ - M4ERROR_AT_LINE ((EXIT_FAILURE, 0, file, *line, - "ERROR: end of file in string")); - - if (MATCH (ch, rquote.string, true)) - { - if (--quote_level == 0) - break; - obstack_grow (&token_stack, rquote.string, rquote.length); - } - else if (MATCH (ch, lquote.string, true)) - { - quote_level++; - obstack_grow (&token_stack, lquote.string, lquote.length); - } - else - obstack_1grow (&token_stack, ch); - } - type = TOKEN_STRING; - } - - obstack_1grow (&token_stack, '\0'); - - TOKEN_DATA_TYPE (td) = TOKEN_TEXT; - TOKEN_DATA_TEXT (td) = (char *) obstack_finish (&token_stack); -#ifdef ENABLE_CHANGEWORD - if (orig_text == NULL) - orig_text = TOKEN_DATA_TEXT (td); - TOKEN_DATA_ORIG_TEXT (td) = orig_text; -#endif -#ifdef DEBUG_INPUT - xfprintf (stderr, "next_token -> %s (%s)\n", - token_type_string (type), TOKEN_DATA_TEXT (td)); -#endif - return type; -} - -/*-----------------------------------------------. -| Peek at the next token from the input stream. | -`-----------------------------------------------*/ - -token_type -peek_token (void) -{ - token_type result; - int ch = peek_input (); - - if (ch == CHAR_EOF) - { - result = TOKEN_EOF; - } - else if (ch == CHAR_MACRO) - { - result = TOKEN_MACDEF; - } - else if (MATCH (ch, bcomm.string, false)) - { - result = TOKEN_STRING; - } - else if ((default_word_regexp && (isalpha (ch) || ch == '_')) -#ifdef ENABLE_CHANGEWORD - || (! default_word_regexp && word_regexp.fastmap[ch]) -#endif /* ENABLE_CHANGEWORD */ - ) - { - result = TOKEN_WORD; - } - else if (MATCH (ch, lquote.string, false)) - { - result = TOKEN_STRING; - } - else - switch (ch) - { - case '(': - result = TOKEN_OPEN; - break; - case ',': - result = TOKEN_COMMA; - break; - case ')': - result = TOKEN_CLOSE; - break; - default: - result = TOKEN_SIMPLE; - } - -#ifdef DEBUG_INPUT - xfprintf (stderr, "peek_token -> %s\n", token_type_string (result)); -#endif /* DEBUG_INPUT */ - return result; -} - - -#ifdef DEBUG_INPUT - -static const char * -token_type_string (token_type t) -{ - switch (t) - { /* TOKSW */ - case TOKEN_EOF: - return "EOF"; - case TOKEN_STRING: - return "STRING"; - case TOKEN_WORD: - return "WORD"; - case TOKEN_OPEN: - return "OPEN"; - case TOKEN_COMMA: - return "COMMA"; - case TOKEN_CLOSE: - return "CLOSE"; - case TOKEN_SIMPLE: - return "SIMPLE"; - case TOKEN_MACDEF: - return "MACDEF"; - default: - abort (); - } - } - -static void -print_token (const char *s, token_type t, token_data *td) -{ - xfprintf (stderr, "%s: ", s); - switch (t) - { /* TOKSW */ - case TOKEN_OPEN: - case TOKEN_COMMA: - case TOKEN_CLOSE: - case TOKEN_SIMPLE: - xfprintf (stderr, "char:"); - break; - - case TOKEN_WORD: - xfprintf (stderr, "word:"); - break; - - case TOKEN_STRING: - xfprintf (stderr, "string:"); - break; - - case TOKEN_MACDEF: - xfprintf (stderr, "macro: %p\n", TOKEN_DATA_FUNC (td)); - break; - - case TOKEN_EOF: - xfprintf (stderr, "eof\n"); - break; - } - xfprintf (stderr, "\t\"%s\"\n", TOKEN_DATA_TEXT (td)); -} - -static void M4_GNUC_UNUSED -lex_debug (void) -{ - token_type t; - token_data td; - - while ((t = next_token (&td, NULL)) != TOKEN_EOF) - print_token ("lex", t, &td); -} -#endif /* DEBUG_INPUT */ +#endif + +enum input_type +{ + INPUT_STRING, /* String resulting from macro expansion. */ + INPUT_FILE, /* File from command line or include. */ + INPUT_MACRO /* Builtin resulting from defn. */ +}; + +typedef enum input_type input_type; + +struct input_block +{ + struct input_block *prev; /* previous input_block on the input stack */ + input_type type; /* see enum values */ + const char *file; /* file where this input is from */ + int line; /* line where this input is from */ + union + { + struct + { + char *string; /* remaining string value */ + char *end; /* terminating NUL of string */ + } + u_s; /* INPUT_STRING */ + struct + { + FILE *fp; /* input file handle */ + bool_bitfield end : 1; /* true if peek has seen EOF */ + bool_bitfield close : 1; /* true if we should close file on pop */ + bool_bitfield advance : 1; /* track previous start_of_input_line */ + } + u_f; /* INPUT_FILE */ + builtin_func *func; /* pointer to macro's function */ + } + u; +}; + +typedef struct input_block input_block; + + +/* Current input file name. */ +const char *current_file; + +/* Current input line number. */ +int current_line; + +/* Obstack for storing individual tokens. */ +static struct obstack token_stack; + +/* Obstack for storing file names. */ +static struct obstack file_names; + +/* Wrapup input stack. */ +static struct obstack *wrapup_stack; + +/* Current stack, from input or wrapup. */ +static struct obstack *current_input; + +/* Bottom of token_stack, for obstack_free. */ +static void *token_bottom; + +/* Pointer to top of current_input. */ +static input_block *isp; + +/* Pointer to top of wrapup_stack. */ +static input_block *wsp; + +/* Aux. for handling split push_string (). */ +static input_block *next; + +/* Flag for next_char () to increment current_line. */ +static bool start_of_input_line; + +/* Flag for next_char () to recognize change in input block. */ +static bool input_change; + +#define CHAR_EOF 256 /* character return on EOF */ +#define CHAR_MACRO 257 /* character return for MACRO token */ + +/* Quote chars. */ +STRING rquote; +STRING lquote; + +/* Comment chars. */ +STRING bcomm; +STRING ecomm; + +#ifdef ENABLE_CHANGEWORD + +# define DEFAULT_WORD_REGEXP "[_a-zA-Z][_a-zA-Z0-9]*" + +static struct re_pattern_buffer word_regexp; +static int default_word_regexp; +static struct re_registers regs; + +#else /* ! ENABLE_CHANGEWORD */ +# define default_word_regexp 1 +#endif /* ! ENABLE_CHANGEWORD */ + +#ifdef DEBUG_INPUT +static const char *token_type_string (token_type); +#endif + + +/*-------------------------------------------------------------------. +| push_file () pushes an input file on the input stack, saving the | +| current file name and line number. If next is non-NULL, this push | +| invalidates a call to push_string_init (), whose storage is | +| consequently released. If CLOSE_WHEN_DONE, then close FP after | +| EOF is detected. | +`-------------------------------------------------------------------*/ + +void +push_file (FILE *fp, const char *title, bool close_when_done) +{ + input_block *i; + + if (next != NULL) + { + obstack_free (current_input, next); + next = NULL; + } + + if (debug_level & DEBUG_TRACE_INPUT) + DEBUG_MESSAGE1 ("input read from %s", title); + + i = (input_block *) obstack_alloc (current_input, + sizeof (struct input_block)); + i->type = INPUT_FILE; + i->file = (char *) obstack_copy0 (&file_names, title, strlen (title)); + i->line = 1; + input_change = true; + + i->u.u_f.fp = fp; + i->u.u_f.end = false; + i->u.u_f.close = close_when_done; + i->u.u_f.advance = start_of_input_line; + output_current_line = -1; + + i->prev = isp; + isp = i; +} + +/*---------------------------------------------------------------. +| push_macro () pushes a builtin macro's definition on the input | +| stack. If next is non-NULL, this push invalidates a call to | +| push_string_init (), whose storage is consequently released. | +`---------------------------------------------------------------*/ + +void +push_macro (builtin_func *func) +{ + input_block *i; + + if (next != NULL) + { + obstack_free (current_input, next); + next = NULL; + } + + i = (input_block *) obstack_alloc (current_input, + sizeof (struct input_block)); + i->type = INPUT_MACRO; + i->file = current_file; + i->line = current_line; + input_change = true; + + i->u.func = func; + i->prev = isp; + isp = i; +} + +/*------------------------------------------------------------------. +| First half of push_string (). The pointer next points to the new | +| input_block. | +`------------------------------------------------------------------*/ + +struct obstack * +push_string_init (void) +{ + if (next != NULL) + { + M4ERROR ((warning_status, 0, + "INTERNAL ERROR: recursive push_string!")); + abort (); + } + + next = (input_block *) obstack_alloc (current_input, + sizeof (struct input_block)); + next->type = INPUT_STRING; + next->file = current_file; + next->line = current_line; + + return current_input; +} + +/*-------------------------------------------------------------------. +| Last half of push_string (). If next is now NULL, a call to | +| push_file () has invalidated the previous call to push_string_init | +| (), so we just give up. If the new object is void, we do not push | +| it. The function push_string_finish () returns a pointer to the | +| finished object. This pointer is only for temporary use, since | +| reading the next token might release the memory used for the | +| object. | +`-------------------------------------------------------------------*/ + +const char * +push_string_finish (void) +{ + const char *ret = NULL; + + if (next == NULL) + return NULL; + + if (obstack_object_size (current_input) > 0) + { + size_t len = obstack_object_size (current_input); + obstack_1grow (current_input, '\0'); + next->u.u_s.string = (char *) obstack_finish (current_input); + next->u.u_s.end = next->u.u_s.string + len; + next->prev = isp; + isp = next; + ret = isp->u.u_s.string; /* for immediate use only */ + input_change = true; + } + else + obstack_free (current_input, next); /* people might leave garbage on it. */ + next = NULL; + return ret; +} + +/*------------------------------------------------------------------. +| The function push_wrapup () pushes a string on the wrapup stack. | +| When the normal input stack gets empty, the wrapup stack will | +| become the input stack, and push_string () and push_file () will | +| operate on wrapup_stack. Push_wrapup should be done as | +| push_string (), but this will suffice, as long as arguments to | +| m4_m4wrap () are moderate in size. | +`------------------------------------------------------------------*/ + +void +push_wrapup (const char *s) +{ + size_t len = strlen (s); + input_block *i; + i = (input_block *) obstack_alloc (wrapup_stack, + sizeof (struct input_block)); + i->prev = wsp; + i->type = INPUT_STRING; + i->file = current_file; + i->line = current_line; + i->u.u_s.string = (char *) obstack_copy0 (wrapup_stack, s, len); + i->u.u_s.end = i->u.u_s.string + len; + wsp = i; +} + + +/*-------------------------------------------------------------------. +| The function pop_input () pops one level of input sources. If the | +| popped input_block is a file, current_file and current_line are | +| reset to the saved values before the memory for the input_block is | +| released. | +`-------------------------------------------------------------------*/ + +static void +pop_input (void) +{ + input_block *tmp = isp->prev; + + switch (isp->type) + { + case INPUT_STRING: + case INPUT_MACRO: + break; + + case INPUT_FILE: + if (debug_level & DEBUG_TRACE_INPUT) + { + if (tmp) + DEBUG_MESSAGE2 ("input reverted to %s, line %d", + tmp->file, tmp->line); + else + DEBUG_MESSAGE ("input exhausted"); + } + + if (ferror (isp->u.u_f.fp)) + { + M4ERROR ((warning_status, 0, "read error")); + if (isp->u.u_f.close) + fclose (isp->u.u_f.fp); + retcode = EXIT_FAILURE; + } + else if (isp->u.u_f.close && fclose (isp->u.u_f.fp) == EOF) + { + M4ERROR ((warning_status, errno, "error reading file")); + retcode = EXIT_FAILURE; + } + start_of_input_line = isp->u.u_f.advance; + output_current_line = -1; + break; + + default: + M4ERROR ((warning_status, 0, + "INTERNAL ERROR: input stack botch in pop_input ()")); + abort (); + } + obstack_free (current_input, isp); + next = NULL; /* might be set in push_string_init () */ + + isp = tmp; + input_change = true; +} + +/*-------------------------------------------------------------------. +| To switch input over to the wrapup stack, main calls pop_wrapup | +| (). Since wrapup text can install new wrapup text, pop_wrapup () | +| returns false when there is no wrapup text on the stack, and true | +| otherwise. | +`-------------------------------------------------------------------*/ + +bool +pop_wrapup (void) +{ + next = NULL; + obstack_free (current_input, NULL); + free (current_input); + + if (wsp == NULL) + { + /* End of the program. Free all memory even though we are about + to exit, since it makes leak detection easier. */ + obstack_free (&token_stack, NULL); + obstack_free (&file_names, NULL); + obstack_free (wrapup_stack, NULL); + free (wrapup_stack); +#ifdef ENABLE_CHANGEWORD + regfree (&word_regexp); +#endif /* ENABLE_CHANGEWORD */ + return false; + } + + current_input = wrapup_stack; + wrapup_stack = (struct obstack *) xmalloc (sizeof (struct obstack)); + obstack_init (wrapup_stack); + + isp = wsp; + wsp = NULL; + input_change = true; + + return true; +} + +/*-------------------------------------------------------------------. +| When a MACRO token is seen, next_token () uses init_macro_token () | +| to retrieve the value of the function pointer. | +`-------------------------------------------------------------------*/ + +static void +init_macro_token (token_data *td) +{ + if (isp->type != INPUT_MACRO) + { + M4ERROR ((warning_status, 0, + "INTERNAL ERROR: bad call to init_macro_token ()")); + abort (); + } + + TOKEN_DATA_TYPE (td) = TOKEN_FUNC; + TOKEN_DATA_FUNC (td) = isp->u.func; +} + + +/*-----------------------------------------------------------------. +| Low level input is done a character at a time. The function | +| peek_input () is used to look at the next character in the input | +| stream. At any given time, it reads from the input_block on the | +| top of the current input stack. | +`-----------------------------------------------------------------*/ + +static int +peek_input (void) +{ + int ch; + input_block *block = isp; + + while (1) + { + if (block == NULL) + return CHAR_EOF; + + switch (block->type) + { + case INPUT_STRING: + ch = to_uchar (block->u.u_s.string[0]); + if (ch != '\0') + return ch; + break; + + case INPUT_FILE: + ch = getc (block->u.u_f.fp); + if (ch != EOF) + { + ungetc (ch, block->u.u_f.fp); + return ch; + } + block->u.u_f.end = true; + break; + + case INPUT_MACRO: + return CHAR_MACRO; + + default: + M4ERROR ((warning_status, 0, + "INTERNAL ERROR: input stack botch in peek_input ()")); + abort (); + } + block = block->prev; + } +} + +/*-------------------------------------------------------------------. +| The function next_char () is used to read and advance the input to | +| the next character. It also manages line numbers for error | +| messages, so they do not get wrong, due to lookahead. The token | +| consisting of a newline alone is taken as belonging to the line it | +| ends, and the current line number is not incremented until the | +| next character is read. 99.9% of all calls will read from a | +| string, so factor that out into a macro for speed. | +`-------------------------------------------------------------------*/ + +#define next_char() \ + (isp && isp->type == INPUT_STRING && isp->u.u_s.string[0] \ + && !input_change \ + ? to_uchar (*isp->u.u_s.string++) \ + : next_char_1 ()) + +static int +next_char_1 (void) +{ + int ch; + + while (1) + { + if (isp == NULL) + { + current_file = ""; + current_line = 0; + return CHAR_EOF; + } + + if (input_change) + { + current_file = isp->file; + current_line = isp->line; + input_change = false; + } + + switch (isp->type) + { + case INPUT_STRING: + ch = to_uchar (*isp->u.u_s.string++); + if (ch != '\0') + return ch; + break; + + case INPUT_FILE: + if (start_of_input_line) + { + start_of_input_line = false; + current_line = ++isp->line; + } + + /* If stdin is a terminal, calling getc after peek_input + already called it would make the user have to hit ^D + twice to quit. */ + ch = isp->u.u_f.end ? EOF : getc (isp->u.u_f.fp); + if (ch != EOF) + { + if (ch == '\n') + start_of_input_line = true; + return ch; + } + break; + + case INPUT_MACRO: + pop_input (); /* INPUT_MACRO input sources has only one token */ + return CHAR_MACRO; + + default: + M4ERROR ((warning_status, 0, + "INTERNAL ERROR: input stack botch in next_char ()")); + abort (); + } + + /* End of input source --- pop one level. */ + pop_input (); + } +} + +/*-------------------------------------------------------------------. +| skip_line () simply discards all immediately following characters, | +| upto the first newline. It is only used from m4_dnl (). | +`-------------------------------------------------------------------*/ + +void +skip_line (void) +{ + int ch; + const char *file = current_file; + int line = current_line; + + while ((ch = next_char ()) != CHAR_EOF && ch != '\n') + ; + if (ch == CHAR_EOF) + /* current_file changed to "" if we see CHAR_EOF, use the + previous value we stored earlier. */ + M4ERROR_AT_LINE ((warning_status, 0, file, line, + "Warning: end of file treated as newline")); + /* On the rare occasion that dnl crosses include file boundaries + (either the input file did not end in a newline, or changeword + was used), calling next_char can update current_file and + current_line, and that update will be undone as we return to + expand_macro. This informs next_char to fix things again. */ + if (file != current_file || line != current_line) + input_change = true; +} + + +/*------------------------------------------------------------------. +| This function is for matching a string against a prefix of the | +| input stream. If the string matches the input and consume is | +| true, the input is discarded; otherwise any characters read are | +| pushed back again. The function is used only when multicharacter | +| quotes or comment delimiters are used. | +`------------------------------------------------------------------*/ + +static bool +match_input (const char *s, bool consume) +{ + int n; /* number of characters matched */ + int ch; /* input character */ + const char *t; + bool result = false; + + ch = peek_input (); + if (ch != to_uchar (*s)) + return false; /* fail */ + + if (s[1] == '\0') + { + if (consume) + next_char (); + return true; /* short match */ + } + + next_char (); + for (n = 1, t = s++; peek_input () == to_uchar (*s++); ) + { + next_char (); + n++; + if (*s == '\0') /* long match */ + { + if (consume) + return true; + result = true; + break; + } + } + + /* Failed or shouldn't consume, push back input. */ + { + struct obstack *h = push_string_init (); + + /* `obstack_grow' may be macro evaluating its arg 1 several times. */ + obstack_grow (h, t, n); + } + push_string_finish (); + return result; +} + +/*--------------------------------------------------------------------. +| The macro MATCH() is used to match a string S against the input. | +| The first character is handled inline, for speed. Hopefully, this | +| will not hurt efficiency too much when single character quotes and | +| comment delimiters are used. If CONSUME, then CH is the result of | +| next_char, and a successful match will discard the matched string. | +| Otherwise, CH is the result of peek_char, and the input stream is | +| effectively unchanged. | +`--------------------------------------------------------------------*/ + +#define MATCH(ch, s, consume) \ + (to_uchar ((s)[0]) == (ch) \ + && (ch) != '\0' \ + && ((s)[1] == '\0' || (match_input ((s) + (consume), consume)))) + + +/*--------------------------------------------------------. +| Initialize input stacks, and quote/comment characters. | +`--------------------------------------------------------*/ + +void +input_init (void) +{ + current_file = ""; + current_line = 0; + + current_input = (struct obstack *) xmalloc (sizeof (struct obstack)); + obstack_init (current_input); + wrapup_stack = (struct obstack *) xmalloc (sizeof (struct obstack)); + obstack_init (wrapup_stack); + + obstack_init (&file_names); + + /* Allocate an object in the current chunk, so that obstack_free + will always work even if the first token parsed spills to a new + chunk. */ + obstack_init (&token_stack); + obstack_alloc (&token_stack, 1); + token_bottom = obstack_base (&token_stack); + + isp = NULL; + wsp = NULL; + next = NULL; + + start_of_input_line = false; + + lquote.string = xstrdup (DEF_LQUOTE); + lquote.length = strlen (lquote.string); + rquote.string = xstrdup (DEF_RQUOTE); + rquote.length = strlen (rquote.string); + bcomm.string = xstrdup (DEF_BCOMM); + bcomm.length = strlen (bcomm.string); + ecomm.string = xstrdup (DEF_ECOMM); + ecomm.length = strlen (ecomm.string); + +#ifdef ENABLE_CHANGEWORD + set_word_regexp (user_word_regexp); +#endif +} + + +/*------------------------------------------------------------------. +| Functions for setting quotes and comment delimiters. Used by | +| m4_changecom () and m4_changequote (). Pass NULL if the argument | +| was not present, to distinguish from an explicit empty string. | +`------------------------------------------------------------------*/ + +void +set_quotes (const char *lq, const char *rq) +{ + free (lquote.string); + free (rquote.string); + + /* POSIX states that with 0 arguments, the default quotes are used. + POSIX XCU ERN 112 states that behavior is implementation-defined + if there was only one argument, or if there is an empty string in + either position when there are two arguments. We allow an empty + left quote to disable quoting, but a non-empty left quote will + always create a non-empty right quote. See the texinfo for what + some other implementations do. */ + if (!lq) + { + lq = DEF_LQUOTE; + rq = DEF_RQUOTE; + } + else if (!rq || (*lq && !*rq)) + rq = DEF_RQUOTE; + + lquote.string = xstrdup (lq); + lquote.length = strlen (lquote.string); + rquote.string = xstrdup (rq); + rquote.length = strlen (rquote.string); +} + +void +set_comment (const char *bc, const char *ec) +{ + free (bcomm.string); + free (ecomm.string); + + /* POSIX requires no arguments to disable comments. It requires + empty arguments to be used as-is, but this is counter to + traditional behavior, because a non-null begin and null end makes + it impossible to end a comment. An aardvark has been filed: + http://www.opengroup.org/austin/mailarchives/ag-review/msg02168.html + This implementation assumes the aardvark will be approved. See + the texinfo for what some other implementations do. */ + if (!bc) + bc = ec = ""; + else if (!ec || (*bc && !*ec)) + ec = DEF_ECOMM; + + bcomm.string = xstrdup (bc); + bcomm.length = strlen (bcomm.string); + ecomm.string = xstrdup (ec); + ecomm.length = strlen (ecomm.string); +} + +#ifdef ENABLE_CHANGEWORD + +void +set_word_regexp (const char *regexp) +{ + const char *msg; + struct re_pattern_buffer new_word_regexp; + + if (!*regexp || STREQ (regexp, DEFAULT_WORD_REGEXP)) + { + default_word_regexp = true; + return; + } + + /* Dry run to see whether the new expression is compilable. */ + init_pattern_buffer (&new_word_regexp, NULL); + msg = re_compile_pattern (regexp, strlen (regexp), &new_word_regexp); + regfree (&new_word_regexp); + + if (msg != NULL) + { + M4ERROR ((warning_status, 0, + "bad regular expression `%s': %s", regexp, msg)); + return; + } + + /* If compilation worked, retry using the word_regexp struct. We + can't rely on struct assigns working, so redo the compilation. + The fastmap can be reused between compilations, and will be freed + by the final regfree. */ + if (!word_regexp.fastmap) + word_regexp.fastmap = xcharalloc (UCHAR_MAX + 1); + msg = re_compile_pattern (regexp, strlen (regexp), &word_regexp); + assert (!msg); + re_set_registers (&word_regexp, ®s, regs.num_regs, regs.start, regs.end); + if (re_compile_fastmap (&word_regexp)) + assert (false); + + default_word_regexp = false; +} + +#endif /* ENABLE_CHANGEWORD */ + + +/*--------------------------------------------------------------------. +| Parse and return a single token from the input stream. A token | +| can either be TOKEN_EOF, if the input_stack is empty; it can be | +| TOKEN_STRING for a quoted string; TOKEN_WORD for something that is | +| a potential macro name; and TOKEN_SIMPLE for any single character | +| that is not a part of any of the previous types. If LINE is not | +| NULL, set *LINE to the line where the token starts. | +| | +| Next_token () return the token type, and passes back a pointer to | +| the token data through TD. The token text is collected on the | +| obstack token_stack, which never contains more than one token text | +| at a time. The storage pointed to by the fields in TD is | +| therefore subject to change the next time next_token () is called. | +`--------------------------------------------------------------------*/ + +token_type +next_token (token_data *td, int *line) +{ + int ch; + int quote_level; + token_type type; +#ifdef ENABLE_CHANGEWORD + int startpos; + char *orig_text = NULL; +#endif + const char *file; + int dummy; + + obstack_free (&token_stack, token_bottom); + if (!line) + line = &dummy; + + /* Can't consume character until after CHAR_MACRO is handled. */ + ch = peek_input (); + if (ch == CHAR_EOF) + { +#ifdef DEBUG_INPUT + xfprintf (stderr, "next_token -> EOF\n"); +#endif + next_char (); + return TOKEN_EOF; + } + if (ch == CHAR_MACRO) + { + init_macro_token (td); + next_char (); +#ifdef DEBUG_INPUT + xfprintf (stderr, "next_token -> MACDEF (%s)\n", + find_builtin_by_addr (TOKEN_DATA_FUNC (td))->name); +#endif + return TOKEN_MACDEF; + } + + next_char (); /* Consume character we already peeked at. */ + file = current_file; + *line = current_line; + if (MATCH (ch, bcomm.string, true)) + { + obstack_grow (&token_stack, bcomm.string, bcomm.length); + while ((ch = next_char ()) != CHAR_EOF + && !MATCH (ch, ecomm.string, true)) + obstack_1grow (&token_stack, ch); + if (ch != CHAR_EOF) + obstack_grow (&token_stack, ecomm.string, ecomm.length); + else + /* current_file changed to "" if we see CHAR_EOF, use the + previous value we stored earlier. */ + M4ERROR_AT_LINE ((EXIT_FAILURE, 0, file, *line, + "ERROR: end of file in comment")); + + type = TOKEN_STRING; + } + else if (default_word_regexp && (isalpha (ch) || ch == '_')) + { + obstack_1grow (&token_stack, ch); + while ((ch = peek_input ()) != CHAR_EOF && (isalnum (ch) || ch == '_')) + { + obstack_1grow (&token_stack, ch); + next_char (); + } + type = TOKEN_WORD; + } + +#ifdef ENABLE_CHANGEWORD + + else if (!default_word_regexp && word_regexp.fastmap[ch]) + { + obstack_1grow (&token_stack, ch); + while (1) + { + ch = peek_input (); + if (ch == CHAR_EOF) + break; + obstack_1grow (&token_stack, ch); + startpos = re_search (&word_regexp, + (char *) obstack_base (&token_stack), + obstack_object_size (&token_stack), 0, 0, + ®s); + if (startpos || + regs.end [0] != (regoff_t) obstack_object_size (&token_stack)) + { + *(((char *) obstack_base (&token_stack) + + obstack_object_size (&token_stack)) - 1) = '\0'; + break; + } + next_char (); + } + + obstack_1grow (&token_stack, '\0'); + orig_text = (char *) obstack_finish (&token_stack); + + if (regs.start[1] != -1) + obstack_grow (&token_stack,orig_text + regs.start[1], + regs.end[1] - regs.start[1]); + else + obstack_grow (&token_stack, orig_text,regs.end[0]); + + type = TOKEN_WORD; + } + +#endif /* ENABLE_CHANGEWORD */ + + else if (!MATCH (ch, lquote.string, true)) + { + switch (ch) + { + case '(': + type = TOKEN_OPEN; + break; + case ',': + type = TOKEN_COMMA; + break; + case ')': + type = TOKEN_CLOSE; + break; + default: + type = TOKEN_SIMPLE; + break; + } + obstack_1grow (&token_stack, ch); + } + else + { + bool fast = lquote.length == 1 && rquote.length == 1; + quote_level = 1; + while (1) + { + /* Try scanning a buffer first. */ + const char *buffer = (isp && isp->type == INPUT_STRING + ? isp->u.u_s.string : NULL); + if (buffer && *buffer) + { + size_t len = isp->u.u_s.end - buffer; + const char *p = buffer; + do + { + p = (char *) memchr2 (p, *lquote.string, *rquote.string, + buffer + len - p); + } + while (p && fast && (*p++ == *rquote.string + ? --quote_level : ++quote_level)); + if (p) + { + if (fast) + { + assert (!quote_level); + obstack_grow (&token_stack, buffer, p - buffer - 1); + isp->u.u_s.string += p - buffer; + break; + } + obstack_grow (&token_stack, buffer, p - buffer); + ch = to_uchar (*p); + isp->u.u_s.string += p - buffer + 1; + } + else + { + obstack_grow (&token_stack, buffer, len); + isp->u.u_s.string += len; + continue; + } + } + /* Fall back to a byte. */ + else + ch = next_char (); + if (ch == CHAR_EOF) + /* current_file changed to "" if we see CHAR_EOF, use + the previous value we stored earlier. */ + M4ERROR_AT_LINE ((EXIT_FAILURE, 0, file, *line, + "ERROR: end of file in string")); + + if (MATCH (ch, rquote.string, true)) + { + if (--quote_level == 0) + break; + obstack_grow (&token_stack, rquote.string, rquote.length); + } + else if (MATCH (ch, lquote.string, true)) + { + quote_level++; + obstack_grow (&token_stack, lquote.string, lquote.length); + } + else + obstack_1grow (&token_stack, ch); + } + type = TOKEN_STRING; + } + + obstack_1grow (&token_stack, '\0'); + + TOKEN_DATA_TYPE (td) = TOKEN_TEXT; + TOKEN_DATA_TEXT (td) = (char *) obstack_finish (&token_stack); +#ifdef ENABLE_CHANGEWORD + if (orig_text == NULL) + orig_text = TOKEN_DATA_TEXT (td); + TOKEN_DATA_ORIG_TEXT (td) = orig_text; +#endif +#ifdef DEBUG_INPUT + xfprintf (stderr, "next_token -> %s (%s)\n", + token_type_string (type), TOKEN_DATA_TEXT (td)); +#endif + return type; +} + +/*-----------------------------------------------. +| Peek at the next token from the input stream. | +`-----------------------------------------------*/ + +token_type +peek_token (void) +{ + token_type result; + int ch = peek_input (); + + if (ch == CHAR_EOF) + { + result = TOKEN_EOF; + } + else if (ch == CHAR_MACRO) + { + result = TOKEN_MACDEF; + } + else if (MATCH (ch, bcomm.string, false)) + { + result = TOKEN_STRING; + } + else if ((default_word_regexp && (isalpha (ch) || ch == '_')) +#ifdef ENABLE_CHANGEWORD + || (! default_word_regexp && word_regexp.fastmap[ch]) +#endif /* ENABLE_CHANGEWORD */ + ) + { + result = TOKEN_WORD; + } + else if (MATCH (ch, lquote.string, false)) + { + result = TOKEN_STRING; + } + else + switch (ch) + { + case '(': + result = TOKEN_OPEN; + break; + case ',': + result = TOKEN_COMMA; + break; + case ')': + result = TOKEN_CLOSE; + break; + default: + result = TOKEN_SIMPLE; + } + +#ifdef DEBUG_INPUT + xfprintf (stderr, "peek_token -> %s\n", token_type_string (result)); +#endif /* DEBUG_INPUT */ + return result; +} + + +#ifdef DEBUG_INPUT + +static const char * +token_type_string (token_type t) +{ + switch (t) + { /* TOKSW */ + case TOKEN_EOF: + return "EOF"; + case TOKEN_STRING: + return "STRING"; + case TOKEN_WORD: + return "WORD"; + case TOKEN_OPEN: + return "OPEN"; + case TOKEN_COMMA: + return "COMMA"; + case TOKEN_CLOSE: + return "CLOSE"; + case TOKEN_SIMPLE: + return "SIMPLE"; + case TOKEN_MACDEF: + return "MACDEF"; + default: + abort (); + } + } + +static void +print_token (const char *s, token_type t, token_data *td) +{ + xfprintf (stderr, "%s: ", s); + switch (t) + { /* TOKSW */ + case TOKEN_OPEN: + case TOKEN_COMMA: + case TOKEN_CLOSE: + case TOKEN_SIMPLE: + xfprintf (stderr, "char:"); + break; + + case TOKEN_WORD: + xfprintf (stderr, "word:"); + break; + + case TOKEN_STRING: + xfprintf (stderr, "string:"); + break; + + case TOKEN_MACDEF: + xfprintf (stderr, "macro: %p\n", TOKEN_DATA_FUNC (td)); + break; + + case TOKEN_EOF: + xfprintf (stderr, "eof\n"); + break; + } + xfprintf (stderr, "\t\"%s\"\n", TOKEN_DATA_TEXT (td)); +} + +static void M4_GNUC_UNUSED +lex_debug (void) +{ + token_type t; + token_data td; + + while ((t = next_token (&td, NULL)) != TOKEN_EOF) + print_token ("lex", t, &td); +} +#endif /* DEBUG_INPUT */ |