gcc 源码阅读--C语言预处理
在 C/C++ 语言中,编译器的第一步处理就是预处理,
即处理 #include、#ifdef、#endif、#if 等预处理指令。
GCC 把这部分的实现放在一个单独的目录 libcpp 中。
这里面有几个重要结构需要了解:
/* A preprocessing token: source location, token type, flags, and a
   union holding the type-specific value. */
struct GTY(()) cpp_token {
/* Location of first char of token, together with range of full token. */
location_t src_loc;
// Records the source location of the first character of the token.
ENUM_BITFIELD(cpp_ttype) type : CHAR_BIT; /* token type */
// The kind of this token, e.g. CPP_NUMBER.
unsigned short flags; /* flags - see above */
union cpp_token_u
{
// Holds the value node ultimately built for the token; different kinds
// of token use different member structures.
/* An identifier. */
struct cpp_identifier GTY ((tag ("CPP_TOKEN_FLD_NODE"))) node;
/* Inherit padding from this token. */
cpp_token * GTY ((tag ("CPP_TOKEN_FLD_SOURCE"))) source;
/* A string, or number. */
struct cpp_string GTY ((tag ("CPP_TOKEN_FLD_STR"))) str;
/* Argument no. (and original spelling) for a CPP_MACRO_ARG. */
struct cpp_macro_arg GTY ((tag ("CPP_TOKEN_FLD_ARG_NO"))) macro_arg;
/* Original token no. for a CPP_PASTE (from a sequence of
consecutive paste tokens in a macro expansion). */
unsigned int GTY ((tag ("CPP_TOKEN_FLD_TOKEN_NO"))) token_no;
/* Caller-supplied identifier for a CPP_PRAGMA. */
unsigned int GTY ((tag ("CPP_TOKEN_FLD_PRAGMA"))) pragma;
} GTY ((desc ("cpp_token_val_index (&%1)"))) val;
};
/* A token with its kind, identifier/keyword/pragma classification,
   source location, associated tree value and flags. */
struct GTY (()) c_token {
/* The kind of token. */
ENUM_BITFIELD (cpp_ttype) type : 8;
/* If this token is a CPP_NAME, this value indicates whether also
declared as some kind of type. Otherwise, it is C_ID_NONE. */
ENUM_BITFIELD (c_id_kind) id_kind : 8;// identifier classification (see the comment above)
/* If this token is a keyword, this value indicates which keyword.
Otherwise, this value is RID_MAX. */
ENUM_BITFIELD (rid) keyword : 8;
/* If this token is a CPP_PRAGMA, this indicates the pragma that
was seen. Otherwise it is PRAGMA_NONE. */
ENUM_BITFIELD (pragma_kind) pragma_kind : 8;
/* The location at which this token was found. */
location_t location;
/* The value associated with this token, if any. */
tree value;
/* Token flags. */
unsigned char flags;
/* Return the source range obtained by looking up LOCATION in the
   global line_table. */
source_range get_range () const
{
return get_range_from_loc (line_table, location);
}
/* Return the m_finish member of this token's source range. */
location_t get_finish () const
{
return get_range ().m_finish;
}
};
/* Excerpt of the parser state: only the look-ahead token members are
   shown; the remaining members are elided below. */
struct c_parser {
c_token * tokens; /* Address of the c_token currently being processed; per the author's note it is expected to point at tokens_buf[0] (TODO confirm the exact cases where it does not). */
c_token tokens_buf[4]; /* Look-ahead cache of c_tokens; per the author's note, GCC's parsing approach never looks ahead more than 4 tokens. */
unsigned int tokens_avail; /* Number of look-ahead tokens currently available in tokens_buf. */
。。。。。。
}
/* State of one preprocessor instance: the buffer stack, lexer state,
   line-map tracking, macro-expansion context, include search paths and
   file caches, character-set converters, identifier hash tables,
   user options and client callbacks.  */
struct cpp_reader
{
  /* Top of buffer stack.  */
  cpp_buffer *buffer;

  /* Overlaid buffer (can be different after processing #include).  */
  cpp_buffer *overlaid_buffer;

  /* Lexer state.  */
  struct lexer_state state;

  /* Source line tracking.  */
  class line_maps *line_table;

  /* The line of the '#' of the current directive.  */
  location_t directive_line;

  /* Memory buffers.  */
  _cpp_buff *a_buff;		/* Aligned permanent storage.  */
  _cpp_buff *u_buff;		/* Unaligned permanent storage.  */
  _cpp_buff *free_buffs;	/* Free buffer chain.  */

  /* Context stack.  */
  struct cpp_context base_context;
  struct cpp_context *context;

  /* If in_directive, the directive if known.  */
  const struct directive *directive;

  /* Token generated while handling a directive, if any.  */
  cpp_token directive_result;

  /* When expanding a macro at top-level, this is the location of the
     macro invocation.  */
  location_t invocation_location;

  /* This is the node representing the macro being expanded at
     top-level.  The value of this data member is valid iff
     cpp_in_macro_expansion_p() returns TRUE.  */
  cpp_hashnode *top_most_macro_node;

  /* Nonzero if we are about to expand a macro.  Note that if we are
     really expanding a macro, the function macro_of_context returns
     the macro being expanded and this flag is set to false.  Client
     code should use the function cpp_in_macro_expansion_p to know if we
     are either about to expand a macro, or are actually expanding
     one.  */
  bool about_to_expand_macro_p;

  /* True if the preprocessor should diagnose CPP_DOT or CPP_COLON
     tokens as the first ones coming from macro expansion.  */
  bool diagnose_dot_colon_from_macro_p;

  /* Search paths for include files.  */
  struct cpp_dir *quote_include;	/* "" */
  struct cpp_dir *bracket_include;	/* <> */
  struct cpp_dir no_search_path;	/* No path.  */
  struct cpp_dir *embed_include;	/* #embed <> */

  /* Chain of all hashed _cpp_file instances.  */
  struct _cpp_file *all_files;

  struct _cpp_file *main_file;

  /* File and directory hash table.  */
  struct htab *file_hash;
  struct htab *dir_hash;
  struct file_hash_entry_pool *file_hash_entries;

  /* Negative path lookup hash table.  */
  struct htab *nonexistent_file_hash;
  struct obstack nonexistent_file_ob;

  /* Nonzero means don't look for #include "foo" the source-file
     directory.  */
  bool quote_ignores_source_dir;

  /* Nonzero if any file has contained #pragma once or #import has
     been used.  */
  bool seen_once_only;

  /* Multiple include optimization and -Wheader-guard warning.  */
  const cpp_hashnode *mi_cmacro;
  const cpp_hashnode *mi_ind_cmacro;
  const cpp_hashnode *mi_def_cmacro;
  location_t mi_loc, mi_def_loc;
  bool mi_valid;

  /* Lexing.  */
  cpp_token *cur_token;
  tokenrun base_run, *cur_run;
  unsigned int lookaheads;

  /* Nonzero prevents the lexer from re-using the token runs.  */
  unsigned int keep_tokens;

  /* Buffer to hold macro definition string.  */
  unsigned char *macro_buffer;
  unsigned int macro_buffer_len;

  /* Descriptor for converting from the source character set to the
     execution character set.  */
  struct cset_converter narrow_cset_desc;

  /* Descriptor for converting from the source character set to the
     UTF-8 execution character set.  */
  struct cset_converter utf8_cset_desc;

  /* Descriptor for converting from the source character set to the
     UTF-16 execution character set.  */
  struct cset_converter char16_cset_desc;

  /* Descriptor for converting from the source character set to the
     UTF-32 execution character set.  */
  struct cset_converter char32_cset_desc;

  /* Descriptor for converting from the source character set to the
     wide execution character set.  */
  struct cset_converter wide_cset_desc;

  /* Date and time text.  Calculated together if either is requested.  */
  const unsigned char *date;
  const unsigned char *time;

  /* Time stamp, set idempotently lazily.  */
  time_t time_stamp;
  int time_stamp_kind;		/* Or errno.  */

  /* A token forcing paste avoidance, and one demarking macro arguments.  */
  cpp_token avoid_paste;
  cpp_token endarg;

  /* Opaque handle to the dependencies of mkdeps.cc.  */
  class mkdeps *deps;

  /* Obstack holding all macro hash nodes.  This never shrinks.
     See identifiers.cc */
  struct obstack hash_ob;

  /* Obstack holding buffer and conditional structures.  This is a
     real stack.  See directives.cc.  */
  struct obstack buffer_ob;

  /* Pragma table - dynamic, because a library user can add to the
     list of recognized pragmas.  */
  struct pragma_entry *pragmas;

  /* Call backs to cpplib client.  */
  struct cpp_callbacks cb;

  /* Identifier hash table.  */
  struct ht *hash_table;

  /* Identifier ancillary data hash table.  */
  struct ht *extra_hash_table;

  /* Expression parser stack.  */
  struct op *op_stack, *op_limit;

  /* User visible options.  */
  struct cpp_options opts;

  /* Special nodes - identifiers with predefined significance to the
     preprocessor.  */
  struct spec_nodes spec_nodes;

  /* Whether cpplib owns the hashtable.  */
  bool our_hashtable, our_extra_hashtable;

  /* Traditional preprocessing output buffer (a logical line).  */
  struct
  {
    unsigned char *base;
    unsigned char *limit;
    unsigned char *cur;
    location_t first_line;
  } out;

  /* Used for buffer overlays by traditional.cc.  */
  const unsigned char *saved_cur, *saved_rlimit, *saved_line_base;

  /* A saved list of the defined macros, for dependency checking
     of precompiled headers.  */
  struct cpp_savedstate *savedstate;

  /* Next value of __COUNTER__ macro.  */
  unsigned int counter;

  /* Table of comments, when state.save_comments is true.  */
  cpp_comment_table comments;

  /* List of saved macros by push_macro.  */
  struct def_pragma_macro *pushed_macros;

  /* If non-zero, the lexer will use this location for the next token
     instead of getting a location from the linemap.  */
  location_t forced_token_location;

  /* Location identifying the main source file -- intended to be line
     zero of said file.  */
  location_t main_loc;

  /* If non-zero, override diagnostic locations (other than DK_NOTE
     diagnostics) to this one.  */
  location_t diagnostic_override_loc;

  /* Returns true iff we should warn about UTF-8 bidirectional control
     characters.  */
  bool warn_bidi_p () const
  {
    return (CPP_OPTION (this, cpp_warn_bidirectional)
	    & (bidirectional_unpaired|bidirectional_any));
  }
};
/* An identifier hash table for cpplib and the front ends. */
struct ht
{
/* Identifiers are allocated from here. */
struct obstack stack; // handles memory allocation for this hash table
/* Points to the first element of a hashnode[nslots] array, i.e. the hash
buckets. Each array element records a pointer to one concrete entry (which
is why each element is called a hashnode). The entry itself is an
ht_identifier, which only represents a string's contents, length and hash.
The bucket array grows automatically: in the lookup function
ht_lookup_with_hash, when the table is found to be more than 3/4 full it is
expanded (reallocate, copy, free the old array). */
hashnode *entries;
/* Call back, allocate a node. */
/*
Per the author's notes, the GCC sources contain two alloc_node functions:
one defined in ./gcc/stringpool.c and one in libcpp/identifiers.c (libcpp
is the directory responsible for preprocessing and lexing).
* cc1 has its own alloc_node and always calls gcc/stringpool.c:alloc_node.
* Other programs that use libcpp.a and do not implement alloc_node
  themselves default to ./libcpp/identifiers.c:alloc_node.
alloc_node allocates the memory for a node; afterwards the pointer stored
in the hashnode[] array points at the element inside that memory. During a
lookup (ht_lookup_with_hash), when a new element has to be inserted,
alloc_node is called and its return value is recorded in hashnode[].
Note: alloc_node may allocate any structure type for the node, as long as
it ultimately returns the ht_identifier embedded in that structure
(./gcc/stringpool.c actually allocates a lang_identifier tree node).
*/
hashnode (*alloc_node) (cpp_hash_table *);
/* Call back, allocate something that hangs off a node like a cpp_macro.
NULL means use the usual allocator. */
void * (*alloc_subobject) (size_t);
unsigned int nslots; // total number of pointers the hash table can hold
unsigned int nelements; /* Number of live elements. */
/* Link to reader, if any. For the benefit of cpplib. */
struct cpp_reader *pfile; // points to the corresponding cpp_reader (i.e. parse_in)
/* Table usage statistics. */
unsigned int searches;
unsigned int collisions;
/* Should 'entries' be freed when it is no longer needed? */
bool entries_owned;
};
预处理相关的初始化与调用流程如下:
toplev::main
=>lang_hooks.init_options = c_common_init_options
=> parse_in = cpp_create_reader() //1) 这里主要是对全局变量parse_in的初始化
=> do_compile
=> process_options
=> lang_hooks.post_options = c_common_post_options
=> cpp_read_main_file (parse_in, in_fnames[0]) //2) 这里主要负责打开并读入编译单元文件
=> compile_file();
=> lang_hooks.parse_file = c_common_parse_file()
=> c_parse_file ();
在上述流程中会初始化预处理模块需要用到的几个全局对象(ident_hash、line_table、parse_in),相关函数如下:
/* Create (or re-create) the two global identifier hash tables.

   ident_hash is created with 2^14 slots and uses alloc_node to
   allocate nodes; ident_hash_extra is created with 2^6 slots and
   allocates zero-initialised cpp_hashnode_extra objects.  Both use
   stringpool_ggc_alloc for sub-objects hanging off a node.  If either
   table already exists it is destroyed first.  */
void
init_stringpool (void)
{
  /* Clean up if we're called more than once.
     (We can't make this idempotent since identifiers contain state) */
  if (ident_hash)
    ht_destroy (ident_hash);
  if (ident_hash_extra)
    ht_destroy (ident_hash_extra);

  /* Create with 16K (2^14) entries.  */
  ident_hash = ht_create (14);
  ident_hash->alloc_node = alloc_node;
  ident_hash->alloc_subobject = stringpool_ggc_alloc;

  /* Create with 64 (2^6) entries.  */
  ident_hash_extra = ht_create (6);
  /* Capture-less lambda, so it converts to the plain function pointer
     that the alloc_node field expects.  */
  ident_hash_extra->alloc_node = [] (cpp_hash_table *)
    {
      return HT_NODE (ggc_cleared_alloc<cpp_hashnode_extra> ());
    };
  ident_hash_extra->alloc_subobject = stringpool_ggc_alloc;
}
/* Excerpt of general_init showing only the line-table setup: the
   global line_table is allocated with ggc_alloc, initialised by
   linemap_init with BUILTINS_LOCATION, and then given its reallocator,
   allocation-size rounding function and default range bits.  The
   statements before and after are elided in this excerpt. */
general_init (const char *argv0, bool init_signals, unique_argv original_argv)
{
.....
line_table = ggc_alloc<line_maps> ();
linemap_init (line_table, BUILTINS_LOCATION);
line_table->m_reallocator = realloc_for_line_map;
line_table->m_round_alloc_size = ggc_round_alloc_size;
line_table->default_range_bits = line_map_suggested_range_bits;
......
}
/* Excerpt of c_common_init_options: constructs g_string_concat_db in
   ggc-allocated storage, then creates the global reader parse_in with
   cpp_create_reader (CLK_GNUCXX when c_dialect_cxx () is true,
   CLK_GNUC89 otherwise), passing ident_hash, line_table and
   ident_hash_extra, and finally installs c_cpp_diagnostic as the
   reader's diagnostic callback.  The rest of the function is elided. */
c_common_init_options (unsigned int decoded_options_count,
struct cl_decoded_option *decoded_options)
{
unsigned int i;
struct cpp_callbacks *cb;
g_string_concat_db
= new (ggc_alloc <string_concat_db> ()) string_concat_db ();
parse_in = cpp_create_reader (c_dialect_cxx () ? CLK_GNUCXX : CLK_GNUC89,
ident_hash, line_table, ident_hash_extra);
cb = cpp_get_callbacks (parse_in);
cb->diagnostic = c_cpp_diagnostic;
.....
}
/* Post-switch processing.
   Excerpt: only the step that reads the main file is shown.  The name
   returned by cpp_read_main_file for in_fnames[0] is stored both in
   this_input_filename and in *PFILENAME; the third argument is true
   when the input is not already preprocessed.  The statements before
   and after are elided. */
bool
c_common_post_options (const char **pfilename)
{
...
*pfilename = this_input_filename
= cpp_read_main_file (parse_in, in_fnames[0],
/* We'll inject preamble pieces if this is
not preprocessed. */
!cpp_opts->preprocessed);
...
}
/* Parse each input file in turn.

   Every iteration finishes option processing, starts the TDI_original
   dump, initialises PCH handling and parses one file inside its own
   file scope.  The loop only calls cpp_read_main_file for in_fnames[1]
   onward, so the first file is expected to have been opened before
   this function runs.  Between files all macros are undefined and the
   file cache is cleared.  The loop stops after the last file, or as
   soon as a following input file cannot be read.  */
void
c_common_parse_file (void)
{
  auto dumps = g->get_dumps ();

  for (unsigned int i = 0;;)
    {
      c_finish_options ();

      /* Open the dump file to use for the original dump output
	 here, to be used during parsing for the current file.  */
      dumps->dump_start (TDI_original, &dump_flags);
      pch_init ();
      push_file_scope ();
      c_parse_file ();
      pop_file_scope ();

      /* And end the main input file, if the debug writer wants it  */
      if (debug_hooks->start_end_main_source_file)
	(*debug_hooks->end_source_file) (0);

      /* Last file done: leave the dump open for the final cleanups.  */
      if (++i >= num_in_fnames)
	break;

      cpp_undef_all (parse_in);
      cpp_clear_file_cache (parse_in);
      this_input_filename
	= cpp_read_main_file (parse_in, in_fnames[i]);

      /* If an input file is missing, abandon further compilation.
	 cpplib has issued a diagnostic.  */
      if (!this_input_filename)
	break;

      dumps->dump_finish (TDI_original);
    }

  c_parse_final_cleanups ();
  /* Closes the dump that was still open when the loop was left.  */
  dumps->dump_finish (TDI_original);
}
/* Fetch the next token from PFILE that is not CPP_PADDING.

   A CPP_EOF token is only peeked at, never consumed, and is returned
   as-is.  Unlike the streaming path used in preprocess_only mode, this
   performs no optional streaming, which makes it suitable for
   processing builtin expansions such as c_common_has_attribute.  */
static const cpp_token *
get_token_no_padding (cpp_reader *pfile)
{
  const cpp_token *tok;

  do
    {
      /* Look before consuming so that CPP_EOF stays in the stream.  */
      tok = cpp_peek_token (pfile, 0);
      if (tok->type == CPP_EOF)
	break;

      tok = cpp_get_token (pfile);
    }
  while (tok->type == CPP_PADDING);

  return tok;
}