当前位置：首页 > news >正文

gcc 源码阅读--C语言预处理

news 2025/7/14 14:54:00

在 C/C++ 语言中，编译器做的第一项处理就是预处理，

比如处理 #include、#ifdef、#endif、#if 等预处理指令。

GCC 把这部分的实现代码放在一个单独的目录 libcpp 中。

这里面有几个重要结构需要了解：

/* A preprocessing token: its source location, its type, its flags and
a value whose active union member depends on the kind of token. */
struct GTY(()) cpp_token {

/* Location of first char of token, together with range of full token. */
location_t src_loc;
// Records the source location of the first character of the token.

ENUM_BITFIELD(cpp_ttype) type : CHAR_BIT; /* token type */
// The token's type, e.g. CPP_NUMBER.
unsigned short flags; /* flags - see above */

union cpp_token_u
{
// The value node finally built for the token; different kinds of
// token use different members (structures) of this union.
/* An identifier. */
struct cpp_identifier GTY ((tag ("CPP_TOKEN_FLD_NODE"))) node;

/* Inherit padding from this token. */
cpp_token * GTY ((tag ("CPP_TOKEN_FLD_SOURCE"))) source;

/* A string, or number. */
struct cpp_string GTY ((tag ("CPP_TOKEN_FLD_STR"))) str;

/* Argument no. (and original spelling) for a CPP_MACRO_ARG. */
struct cpp_macro_arg GTY ((tag ("CPP_TOKEN_FLD_ARG_NO"))) macro_arg;

/* Original token no. for a CPP_PASTE (from a sequence of
consecutive paste tokens in a macro expansion). */
unsigned int GTY ((tag ("CPP_TOKEN_FLD_TOKEN_NO"))) token_no;

/* Caller-supplied identifier for a CPP_PRAGMA. */
unsigned int GTY ((tag ("CPP_TOKEN_FLD_PRAGMA"))) pragma;
} GTY ((desc ("cpp_token_val_index (&%1)"))) val;
};
/* A token as seen by the C parser: the preprocessor token type plus
front-end classification (identifier kind, keyword, pragma kind), its
location, its tree value and its flags. */
struct GTY (()) c_token {
/* The kind of token. */
ENUM_BITFIELD (cpp_ttype) type : 8;
/* If this token is a CPP_NAME, this value indicates whether also
declared as some kind of type. Otherwise, it is C_ID_NONE. */
ENUM_BITFIELD (c_id_kind) id_kind : 8;// classification of an identifier (CPP_NAME) token
/* If this token is a keyword, this value indicates which keyword.
Otherwise, this value is RID_MAX. */
ENUM_BITFIELD (rid) keyword : 8;
/* If this token is a CPP_PRAGMA, this indicates the pragma that
was seen. Otherwise it is PRAGMA_NONE. */
ENUM_BITFIELD (pragma_kind) pragma_kind : 8;
/* The location at which this token was found. */
location_t location;
/* The value associated with this token, if any. */
tree value;
/* Token flags. */
unsigned char flags;

/* Return the source range for this token, obtained by looking up
its location in the global line_table. */
source_range get_range () const
{
return get_range_from_loc (line_table, location);
}

/* Return the m_finish member of this token's source range
(presumably the location where the token ends -- confirm). */
location_t get_finish () const
{
return get_range ().m_finish;
}
};

/* C parser state, as excerpted by the article: only the token
   look-ahead members are shown, the rest of the structure is elided.  */
struct c_parser {

  /* Address of the c_token currently being processed.  According to the
     article's author it points at tokens_buf[0] except during
     initialization.  NOTE(review): confirm against the parser source.  */
  c_token *tokens;

  /* Look-ahead buffer of c_tokens.  The author notes that, by design of
     GCC's parser, look-ahead never exceeds 4 tokens.  */
  c_token tokens_buf[4];

  /* Number of look-ahead tokens currently available in tokens_buf.  */
  unsigned int tokens_avail;

  /* ... remaining members omitted in this excerpt ... */
};

/* State of one preprocessor instance (a "reader"): buffer and context
   stacks, lexer state, include search paths, file and identifier hash
   tables, character-set converters, options and client callbacks.  In
   the call chain shown in this article the instance returned by
   cpp_create_reader is stored in the global parse_in.  */
struct cpp_reader
{
  /* Top of buffer stack. */
  cpp_buffer *buffer;

  /* Overlaid buffer (can be different after processing #include). */
  cpp_buffer *overlaid_buffer;

  /* Lexer state. */
  struct lexer_state state;

  /* Source line tracking. */
  class line_maps *line_table;

  /* The line of the '#' of the current directive. */
  location_t directive_line;

  /* Memory buffers. */
  _cpp_buff *a_buff;       /* Aligned permanent storage. */
  _cpp_buff *u_buff;       /* Unaligned permanent storage. */
  _cpp_buff *free_buffs;   /* Free buffer chain. */

  /* Context stack. */
  struct cpp_context base_context;
  struct cpp_context *context;

  /* If in_directive, the directive if known. */
  const struct directive *directive;

  /* Token generated while handling a directive, if any. */
  cpp_token directive_result;

  /* When expanding a macro at top-level, this is the location of the
     macro invocation. */
  location_t invocation_location;

  /* This is the node representing the macro being expanded at
     top-level. The value of this data member is valid iff
     cpp_in_macro_expansion_p() returns TRUE. */
  cpp_hashnode *top_most_macro_node;

  /* Nonzero if we are about to expand a macro. Note that if we are
     really expanding a macro, the function macro_of_context returns
     the macro being expanded and this flag is set to false. Client
     code should use the function cpp_in_macro_expansion_p to know if we
     are either about to expand a macro, or are actually expanding
     one. */
  bool about_to_expand_macro_p;

  /* True if the preprocessor should diagnose CPP_DOT or CPP_COLON
     tokens as the first ones coming from macro expansion. */
  bool diagnose_dot_colon_from_macro_p;

  /* Search paths for include files. */
  struct cpp_dir *quote_include;   /* "" */
  struct cpp_dir *bracket_include;   /* <> */
  struct cpp_dir no_search_path;   /* No path. */
  struct cpp_dir *embed_include;   /* #embed <> */

  /* Chain of all hashed _cpp_file instances. */
  struct _cpp_file *all_files;

  struct _cpp_file *main_file;

  /* File and directory hash table. */
  struct htab *file_hash;
  struct htab *dir_hash;
  struct file_hash_entry_pool *file_hash_entries;

  /* Negative path lookup hash table. */
  struct htab *nonexistent_file_hash;
  struct obstack nonexistent_file_ob;

  /* Nonzero means don't look for #include "foo" the source-file
     directory. */
  bool quote_ignores_source_dir;

  /* Nonzero if any file has contained #pragma once or #import has
     been used. */
  bool seen_once_only;

  /* Multiple include optimization and -Wheader-guard warning. */
  const cpp_hashnode *mi_cmacro;
  const cpp_hashnode *mi_ind_cmacro;
  const cpp_hashnode *mi_def_cmacro;
  location_t mi_loc, mi_def_loc;
  bool mi_valid;

  /* Lexing. */
  cpp_token *cur_token;
  tokenrun base_run, *cur_run;
  unsigned int lookaheads;

  /* Nonzero prevents the lexer from re-using the token runs. */
  unsigned int keep_tokens;

  /* Buffer to hold macro definition string. */
  unsigned char *macro_buffer;
  unsigned int macro_buffer_len;

  /* Descriptor for converting from the source character set to the
     execution character set. */
  struct cset_converter narrow_cset_desc;

  /* Descriptor for converting from the source character set to the
     UTF-8 execution character set. */
  struct cset_converter utf8_cset_desc;

  /* Descriptor for converting from the source character set to the
     UTF-16 execution character set. */
  struct cset_converter char16_cset_desc;

  /* Descriptor for converting from the source character set to the
     UTF-32 execution character set. */
  struct cset_converter char32_cset_desc;

  /* Descriptor for converting from the source character set to the
     wide execution character set. */
  struct cset_converter wide_cset_desc;

  /* Date and time text. Calculated together if either is requested. */
  const unsigned char *date;
  const unsigned char *time;

  /* Time stamp, set idempotently lazily. */
  time_t time_stamp;
  int time_stamp_kind; /* Or errno. */

  /* A token forcing paste avoidance, and one demarking macro arguments. */
  cpp_token avoid_paste;
  cpp_token endarg;

  /* Opaque handle to the dependencies of mkdeps.cc. */
  class mkdeps *deps;

  /* Obstack holding all macro hash nodes. This never shrinks.
     See identifiers.cc */
  struct obstack hash_ob;

  /* Obstack holding buffer and conditional structures. This is a
     real stack. See directives.cc. */
  struct obstack buffer_ob;

  /* Pragma table - dynamic, because a library user can add to the
     list of recognized pragmas. */
  struct pragma_entry *pragmas;

  /* Call backs to cpplib client. */
  struct cpp_callbacks cb;

  /* Identifier hash table. */
  struct ht *hash_table;

  /* Identifier ancillary data hash table. */
  struct ht *extra_hash_table;

  /* Expression parser stack. */
  struct op *op_stack, *op_limit;

  /* User visible options. */
  struct cpp_options opts;

  /* Special nodes - identifiers with predefined significance to the
     preprocessor. */
  struct spec_nodes spec_nodes;

  /* Whether cpplib owns the hashtable. */
  bool our_hashtable, our_extra_hashtable;

  /* Traditional preprocessing output buffer (a logical line). */
  struct
  {
    unsigned char *base;
    unsigned char *limit;
    unsigned char *cur;
    location_t first_line;
  } out;

  /* Used for buffer overlays by traditional.cc. */
  const unsigned char *saved_cur, *saved_rlimit, *saved_line_base;

  /* A saved list of the defined macros, for dependency checking
     of precompiled headers. */
  struct cpp_savedstate *savedstate;

  /* Next value of __COUNTER__ macro. */
  unsigned int counter;

  /* Table of comments, when state.save_comments is true. */
  cpp_comment_table comments;

  /* List of saved macros by push_macro. */
  struct def_pragma_macro *pushed_macros;

  /* If non-zero, the lexer will use this location for the next token
     instead of getting a location from the linemap. */
  location_t forced_token_location;

  /* Location identifying the main source file -- intended to be line
     zero of said file. */
  location_t main_loc;

  /* If non-zero, override diagnostic locations (other than DK_NOTE
     diagnostics) to this one. */
  location_t diagnostic_override_loc;

  /* Returns true iff we should warn about UTF-8 bidirectional control
     characters. */
  bool warn_bidi_p () const
  {
    return (CPP_OPTION (this, cpp_warn_bidirectional)
	    & (bidirectional_unpaired|bidirectional_any));
  }
};

/* An identifier hash table for cpplib and the front ends. */
struct ht
{
/* Identifiers are allocated from here. */
struct obstack stack; // obstack that serves the memory allocations of this hash table

/* Points to the first element of a hashnode[nslots] array. This array
is the set of hash buckets; every element of it records a pointer to
one concrete entry (which is why each element is called a hashnode).
What a hashnode refers to is an ht_identifier, which can only describe
a string's contents, its length and its hash.
The bucket array grows automatically: when the lookup function
ht_lookup_with_hash finds that the table is more than 3/4 full, it
expands the table (allocate a new array, copy, free the old one). */

hashnode *entries;
/* Call back, allocate a node. */

/*
There are two alloc_node functions in the GCC sources: one defined in
gcc/stringpool.c and one in libcpp/identifiers.c (libcpp is the
directory responsible for preprocessing and lexing); newer releases
name these files with a .cc suffix.
* cc1 has its own alloc_node and always calls gcc/stringpool.c:alloc_node.
* Other programs that link libcpp.a and do not provide their own
alloc_node use libcpp/identifiers.c:alloc_node by default.
alloc_node allocates the memory for a node; after allocation the pointer
stored in the hashnode[] array refers to an element inside that memory.
When a lookup (ht_lookup_with_hash) finds that a new element has to be
inserted, it calls alloc_node and records the returned value in
hashnode[].
Note: alloc_node may allocate any kind of structure for the node, as
long as it finally returns the ht_identifier contained in that structure
(gcc/stringpool.c actually allocates a lang_identifier tree node).
*/
hashnode (*alloc_node) (cpp_hash_table *);
/* Call back, allocate something that hangs off a node like a cpp_macro.
NULL means use the usual allocator. */
void * (*alloc_subobject) (size_t);

unsigned int nslots; // total number of pointer slots in the table
unsigned int nelements; /* Number of live elements. */

/* Link to reader, if any. For the benefit of cpplib. */
struct cpp_reader *pfile; // the corresponding cpp_reader (i.e. parse_in)

/* Table usage statistics. */
unsigned int searches;
unsigned int collisions;

/* Should 'entries' be freed when it is no longer needed? */
bool entries_owned;
};

预处理相关代码的调用链如下：

toplev::main
=>lang_hooks.init_options = c_common_init_options
=> parse_in = cpp_create_reader() //1) 这里主要是对全局变量parse_in的初始化
=> do_compile
=> process_options
=> lang_hooks.post_options = c_common_post_options
=> cpp_read_main_file (parse_in, in_fnames[0]) //2) 这里主要负责打开并读入编译单元文件
=> compile_file();
=> lang_hooks.parse_file = c_common_parse_file()
=> c_parse_file ();

在下面这几个函数中，会初始化预处理模块需要用到的几个全局对象（ident_hash、line_table、parse_in 等）：

/* Create the two identifier hash tables used by the front end and
   cpplib: ident_hash (2^14 slots) and the ancillary ident_hash_extra
   (2^6 slots), and install their node / sub-object allocators.  May be
   called more than once; tables from a previous call are destroyed
   first.  */
void
init_stringpool (void)
{
  /* Clean up if we're called more than once.
     (We can't make this idempotent since identifiers contain state) */
  if (ident_hash)
    ht_destroy (ident_hash);
  if (ident_hash_extra)
    ht_destroy (ident_hash_extra);

  /* Create with 16K (2^14) entries. */
  ident_hash = ht_create (14);
  ident_hash->alloc_node = alloc_node;
  ident_hash->alloc_subobject = stringpool_ggc_alloc;

  /* Create with 64 (2^6) entries. */
  ident_hash_extra = ht_create (6);
  /* Nodes of the extra table are zero-initialized cpp_hashnode_extra
     objects; the table argument is not needed by this allocator.  */
  ident_hash_extra->alloc_node = [] (cpp_hash_table *)
    {
      return HT_NODE (ggc_cleared_alloc<cpp_hashnode_extra> ());
    };
  ident_hash_extra->alloc_subobject = stringpool_ggc_alloc;
}
general_init (const char *argv0, bool init_signals, unique_argv original_argv)

{

.....

line_table = ggc_alloc<line_maps> ();
linemap_init (line_table, BUILTINS_LOCATION);
line_table->m_reallocator = realloc_for_line_map;
line_table->m_round_alloc_size = ggc_round_alloc_size;
line_table->default_range_bits = line_map_suggested_range_bits;

......

}

/* Excerpt of c_common_init_options: creates the string concatenation
   database and the preprocessor reader parse_in, then installs the
   diagnostic callback.  The rest of the function is elided by the
   article.  */
void
c_common_init_options (unsigned int decoded_options_count,
		       struct cl_decoded_option *decoded_options)
{
  unsigned int i;	/* Not used in the visible part; presumably used
			   by the elided code.  */
  struct cpp_callbacks *cb;

  /* Construct the database in storage obtained from ggc_alloc.  */
  g_string_concat_db
    = new (ggc_alloc <string_concat_db> ()) string_concat_db ();

  /* Create the reader with the GNU C++ or GNU C89 dialect, sharing the
     identifier tables and the line table set up earlier.  */
  parse_in = cpp_create_reader (c_dialect_cxx () ? CLK_GNUCXX : CLK_GNUC89,
				ident_hash, line_table, ident_hash_extra);
  cb = cpp_get_callbacks (parse_in);
  cb->diagnostic = c_cpp_diagnostic;

  /* ... remaining option initialization omitted in this excerpt ... */
}

/* Post-switch processing. */
bool
c_common_post_options (const char **pfilename)

{

...

*pfilename = this_input_filename
= cpp_read_main_file (parse_in, in_fnames[0],
/* We'll inject preamble pieces if this is
not preprocessed. */
!cpp_opts->preprocessed);

...

}

/* Parse each input file in in_fnames in turn.  For every file the
   options are finished, the TDI_original dump is started, and the file
   is parsed inside its own file scope.  Between files all macros are
   undefined, the file cache is cleared and the next main file is read;
   a missing input file stops further compilation.  */
void
c_common_parse_file (void)
{
  auto dumps = g->get_dumps ();
  /* The loop has no condition: it is left through one of the two
     breaks below.  */
  for (unsigned int i = 0;;)
    {
      c_finish_options ();
      /* Open the dump file to use for the original dump output
	 here, to be used during parsing for the current file. */
      dumps->dump_start (TDI_original, &dump_flags);
      pch_init ();
      push_file_scope ();
      c_parse_file ();
      pop_file_scope ();
      /* And end the main input file, if the debug writer wants it */
      if (debug_hooks->start_end_main_source_file)
	(*debug_hooks->end_source_file) (0);
      if (++i >= num_in_fnames)
	break;
      cpp_undef_all (parse_in);
      cpp_clear_file_cache (parse_in);
      this_input_filename
	= cpp_read_main_file (parse_in, in_fnames[i]);
      /* If an input file is missing, abandon further compilation.
	 cpplib has issued a diagnostic. */
      if (!this_input_filename)
	break;
      dumps->dump_finish (TDI_original);
    }

  c_parse_final_cleanups ();
  /* Closes the dump started in the iteration that left the loop.  */
  dumps->dump_finish (TDI_original);
}

/* Fetch the next token from PFILE that is not CPP_PADDING.  A pending
   CPP_EOF is only peeked at and returned, never consumed.  No
   preprocess_only streaming is done here, which makes this wrapper
   around cpp_get_token usable while handling builtin expansions such
   as c_common_has_attribute.  */

static const cpp_token *
get_token_no_padding (cpp_reader *pfile)
{
  const cpp_token *tok;

  do
    {
      /* Look before consuming so that an EOF stays in the stream.  */
      tok = cpp_peek_token (pfile, 0);
      if (tok->type == CPP_EOF)
	break;
      tok = cpp_get_token (pfile);
    }
  while (tok->type == CPP_PADDING);

  return tok;
}

查看全文

http://www.lryc.cn/news/587319.html