gcc 源码阅读--C语言预处理

在 C/C++ 语言中,编译器的第一项处理工作就是预处理,

比如处理 #include、#ifdef、#endif、#if 等预处理指令。

GCC 把实现这部分功能的代码放在一个单独的目录 libcpp 中。

这里面有几个重要结构需要了解:

/* A single preprocessing token: its source location, its token type,
   its flags, and a type-dependent value stored in the union VAL.  */
struct GTY(()) cpp_token {

  /* Location of first char of token, together with range of full token.  */
  location_t src_loc;
   // Records the source location of the first character of the token.


  ENUM_BITFIELD(cpp_ttype) type : CHAR_BIT;  /* token type */
  // The type of this token, e.g. CPP_NUMBER.
  unsigned short flags;     /* flags - see above */

  union cpp_token_u
  {
   // Holds the value node finally built for the token; different kinds of
   // token use different member structures.
    /* An identifier.  */
    struct cpp_identifier GTY ((tag ("CPP_TOKEN_FLD_NODE"))) node;

    /* Inherit padding from this token.  */
    cpp_token * GTY ((tag ("CPP_TOKEN_FLD_SOURCE"))) source;

    /* A string, or number.  */
    struct cpp_string GTY ((tag ("CPP_TOKEN_FLD_STR"))) str;

    /* Argument no. (and original spelling) for a CPP_MACRO_ARG.  */
    struct cpp_macro_arg GTY ((tag ("CPP_TOKEN_FLD_ARG_NO"))) macro_arg;

    /* Original token no. for a CPP_PASTE (from a sequence of
       consecutive paste tokens in a macro expansion).  */
    unsigned int GTY ((tag ("CPP_TOKEN_FLD_TOKEN_NO"))) token_no;

    /* Caller-supplied identifier for a CPP_PRAGMA.  */
    unsigned int GTY ((tag ("CPP_TOKEN_FLD_PRAGMA"))) pragma;
  } GTY ((desc ("cpp_token_val_index (&%1)"))) val;
};
/* A token as held by the C parser: token type, identifier kind, keyword,
   pragma kind, source location, associated tree value and flags.  */
struct GTY (()) c_token {
  /* The kind of token.  */
  ENUM_BITFIELD (cpp_ttype) type : 8;
  /* If this token is a CPP_NAME, this value indicates whether also
     declared as some kind of type.  Otherwise, it is C_ID_NONE.  */
  ENUM_BITFIELD (c_id_kind) id_kind : 8;// classification of the identifier
  /* If this token is a keyword, this value indicates which keyword.
     Otherwise, this value is RID_MAX.  */
  ENUM_BITFIELD (rid) keyword : 8;
  /* If this token is a CPP_PRAGMA, this indicates the pragma that
     was seen.  Otherwise it is PRAGMA_NONE.  */
  ENUM_BITFIELD (pragma_kind) pragma_kind : 8;
  /* The location at which this token was found.  */
  location_t location;
  /* The value associated with this token, if any.  */
  tree value;
  /* Token flags.  */
  unsigned char flags;

  /* Return the source range obtained from LOCATION via the global
     line_table.  */
  source_range get_range () const
  {
    return get_range_from_loc (line_table, location);
  }

  /* Return the m_finish end point of this token's source range.  */
  location_t get_finish () const
  {
    return get_range ().m_finish;
  }
};

struct c_parser {

  c_token * tokens;    /* 当前正在处理的语法符号c_token的地址,这里除了初始化时,应该指向 tokens_buf[0] */

  c_token tokens_buf[4];  /* c_token预读缓存,按照gcc的语法分析原理,预读不会超过4个语法符号 */

  unsigned int tokens_avail;    /* tokens_buf中可用的预读词法符号的数目 */
。。。。。。
}

struct cpp_reader
{
  /* Top of buffer stack.  */
  cpp_buffer *buffer;

  /* Overlaid buffer (can be different after processing #include).  */
  cpp_buffer *overlaid_buffer;

  /* Lexer state.  */
  struct lexer_state state;

  /* Source line tracking.  */
  class line_maps *line_table;

  /* The line of the '#' of the current directive.  */
  location_t directive_line;

  /* Memory buffers.  */
  _cpp_buff *a_buff;        /* Aligned permanent storage.  */
  _cpp_buff *u_buff;        /* Unaligned permanent storage.  */
  _cpp_buff *free_buffs;    /* Free buffer chain.  */

  /* Context stack.  */
  struct cpp_context base_context;
  struct cpp_context *context;

  /* If in_directive, the directive if known.  */
  const struct directive *directive;

  /* Token generated while handling a directive, if any. */
  cpp_token directive_result;

  /* When expanding a macro at top-level, this is the location of the
     macro invocation.  */
  location_t invocation_location;

  /* This is the node representing the macro being expanded at
     top-level.  The value of this data member is valid iff
     cpp_in_macro_expansion_p() returns TRUE.  */
  cpp_hashnode *top_most_macro_node;

  /* Nonzero if we are about to expand a macro.  Note that if we are
     really expanding a macro, the function macro_of_context returns
     the macro being expanded and this flag is set to false.  Client
     code should use the function cpp_in_macro_expansion_p to know if we
     are either about to expand a macro, or are actually expanding
     one.  */
  bool about_to_expand_macro_p;

  /* True if the preprocessor should diagnose CPP_DOT or CPP_COLON
     tokens as the first ones coming from macro expansion.  */
  bool diagnose_dot_colon_from_macro_p;

  /* Search paths for include files.  */
  struct cpp_dir *quote_include;    /* "" */
  struct cpp_dir *bracket_include;    /* <> */
  struct cpp_dir no_search_path;    /* No path.  */
  struct cpp_dir *embed_include;    /* #embed <> */

  /* Chain of all hashed _cpp_file instances.  */
  struct _cpp_file *all_files;

  struct _cpp_file *main_file;

  /* File and directory hash table.  */
  struct htab *file_hash;
  struct htab *dir_hash;
  struct file_hash_entry_pool *file_hash_entries;

  /* Negative path lookup hash table.  */
  struct htab *nonexistent_file_hash;
  struct obstack nonexistent_file_ob;

  /* Nonzero means don't look for #include "foo" the source-file
     directory.  */
  bool quote_ignores_source_dir;

  /* Nonzero if any file has contained #pragma once or #import has
     been used.  */
  bool seen_once_only;

  /* Multiple include optimization and -Wheader-guard warning.  */
  const cpp_hashnode *mi_cmacro;
  const cpp_hashnode *mi_ind_cmacro;
  const cpp_hashnode *mi_def_cmacro;
  location_t mi_loc, mi_def_loc;
  bool mi_valid;

  /* Lexing.  */
  cpp_token *cur_token;
  tokenrun base_run, *cur_run;
  unsigned int lookaheads;

  /* Nonzero prevents the lexer from re-using the token runs.  */
  unsigned int keep_tokens;

  /* Buffer to hold macro definition string.  */
  unsigned char *macro_buffer;
  unsigned int macro_buffer_len;

  /* Descriptor for converting from the source character set to the
     execution character set.  */
  struct cset_converter narrow_cset_desc;

  /* Descriptor for converting from the source character set to the
     UTF-8 execution character set.  */
  struct cset_converter utf8_cset_desc;

  /* Descriptor for converting from the source character set to the
     UTF-16 execution character set.  */
  struct cset_converter char16_cset_desc;

  /* Descriptor for converting from the source character set to the
     UTF-32 execution character set.  */
  struct cset_converter char32_cset_desc;

  /* Descriptor for converting from the source character set to the
     wide execution character set.  */
  struct cset_converter wide_cset_desc;

  /* Date and time text.  Calculated together if either is requested.  */
  const unsigned char *date;
  const unsigned char *time;

  /* Time stamp, set idempotently lazily.  */
  time_t time_stamp;
  int time_stamp_kind; /* Or errno.  */

  /* A token forcing paste avoidance, and one demarking macro arguments.  */
  cpp_token avoid_paste;
  cpp_token endarg;

  /* Opaque handle to the dependencies of mkdeps.cc.  */
  class mkdeps *deps;

  /* Obstack holding all macro hash nodes.  This never shrinks.
     See identifiers.cc */
  struct obstack hash_ob;

  /* Obstack holding buffer and conditional structures.  This is a
     real stack.  See directives.cc.  */
  struct obstack buffer_ob;

  /* Pragma table - dynamic, because a library user can add to the
     list of recognized pragmas.  */
  struct pragma_entry *pragmas;

  /* Call backs to cpplib client.  */
  struct cpp_callbacks cb;

  /* Identifier hash table.  */
  struct ht *hash_table;

  /* Identifier ancillary data hash table.  */
  struct ht *extra_hash_table;

  /* Expression parser stack.  */
  struct op *op_stack, *op_limit;

  /* User visible options.  */
  struct cpp_options opts;

  /* Special nodes - identifiers with predefined significance to the
     preprocessor.  */
  struct spec_nodes spec_nodes;

  /* Whether cpplib owns the hashtable.  */
  bool our_hashtable, our_extra_hashtable;

  /* Traditional preprocessing output buffer (a logical line).  */
  struct
  {
    unsigned char *base;
    unsigned char *limit;
    unsigned char *cur;
    location_t first_line;
  } out;

  /* Used for buffer overlays by traditional.cc.  */
  const unsigned char *saved_cur, *saved_rlimit, *saved_line_base;

  /* A saved list of the defined macros, for dependency checking
     of precompiled headers.  */
  struct cpp_savedstate *savedstate;

  /* Next value of __COUNTER__ macro. */
  unsigned int counter;

  /* Table of comments, when state.save_comments is true.  */
  cpp_comment_table comments;

  /* List of saved macros by push_macro.  */
  struct def_pragma_macro *pushed_macros;

  /* If non-zero, the lexer will use this location for the next token
     instead of getting a location from the linemap.  */
  location_t forced_token_location;

  /* Location identifying the main source file -- intended to be line
     zero of said file.  */
  location_t main_loc;

  /* If non-zero, override diagnostic locations (other than DK_NOTE
     diagnostics) to this one.  */
  location_t diagnostic_override_loc;

  /* Returns true iff we should warn about UTF-8 bidirectional control
     characters.  */
  bool warn_bidi_p () const
  {
    return (CPP_OPTION (this, cpp_warn_bidirectional)
        & (bidirectional_unpaired|bidirectional_any));
  }
}


/* An identifier hash table for cpplib and the front ends.  */
struct ht
{
  /* Identifiers are allocated from here.  */
  struct obstack stack;  // Handles memory allocation for this hash table.

   /* Points to the first element of a hashnode[nslots] array -- the hash
      buckets.  Each array element records a pointer to one concrete entry
      (which is why each element is called a hashnode).  The entry behind
      a hashnode is an ht_identifier, which only carries a string's
      contents, length and hash.
      The bucket array grows automatically: during lookup in
      ht_lookup_with_hash, when the table is found to be more than 3/4
      full it is expanded (a new array is allocated, the entries are
      copied over and the old array is released).  */

  hashnode *entries;
  /* Call back, allocate a node.  */

/*
     GCC contains two alloc_node functions: one defined in
     ./gcc/stringpool.c and one in libcpp/identifiers.c (libcpp is the
     directory responsible for preprocessing and lexing).
     * cc1 has its own alloc_node and always uses
       gcc/stringpool.c:alloc_node.
     * Other programs that link libcpp.a and do not provide their own
       alloc_node fall back to ./libcpp/identifiers.c:alloc_node.
     alloc_node allocates the memory for a node; after allocation the
     pointer stored in the hashnode[] array refers to the element inside
     that memory.  During a lookup (ht_lookup_with_hash), when a new
     element has to be inserted, alloc_node is called to allocate it and
     its return value is recorded in hashnode[].
     Note: alloc_node may allocate any structure type, as long as what it
     finally returns is the ht_identifier embedded in that structure
     (./gcc/stringpool.c actually allocates a lang_identifier tree node).
  */
  hashnode (*alloc_node) (cpp_hash_table *);
  /* Call back, allocate something that hangs off a node like a cpp_macro.  
     NULL means use the usual allocator.  */
  void * (*alloc_subobject) (size_t);

  unsigned int nslots;      // Total number of pointers the table can hold.
  unsigned int nelements;   /* Number of live elements.  */

  /* Link to reader, if any.  For the benefit of cpplib.  */
  struct cpp_reader *pfile;  // Points to the associated cpp_reader (i.e. parse_in).

  /* Table usage statistics.  */
  unsigned int searches;
  unsigned int collisions;

  /* Should 'entries' be freed when it is no longer needed?  */
  bool entries_owned;
};

实现代码在:

toplev::main
  => lang_hooks.init_options = c_common_init_options
    => parse_in = cpp_create_reader()                         //1) 这里主要是对全局变量parse_in的初始化
  => do_compile
    => process_options
      => lang_hooks.post_options = c_common_post_options
        => cpp_read_main_file (parse_in, in_fnames[0])        //2) 这里主要负责打开并读入编译单元文件
    => compile_file();
      => lang_hooks.parse_file = c_common_parse_file()
        => c_parse_file ();
 

在上述调用流程涉及的函数中,会初始化该模块需要用到的几个全局对象(ident_hash、ident_hash_extra、line_table、parse_in 等):

init_stringpool (void)
{
  /* Clean up if we're called more than once.
     (We can't make this idempotent since identifiers contain state) */
  if (ident_hash)
    ht_destroy (ident_hash);
  if (ident_hash_extra)
    ht_destroy (ident_hash_extra);

  /* Create with 16K (2^14) entries.  */
  ident_hash = ht_create (14);
  ident_hash->alloc_node = alloc_node;
  ident_hash->alloc_subobject = stringpool_ggc_alloc;

  /* Create with 64 (2^6) entries.  */
  ident_hash_extra = ht_create (6);
  ident_hash_extra->alloc_node = [] (cpp_hash_table *)
  {
    return HT_NODE (ggc_cleared_alloc<cpp_hashnode_extra> ());
  };
  ident_hash_extra->alloc_subobject = stringpool_ggc_alloc;
}
general_init (const char *argv0, bool init_signals, unique_argv original_argv)

{

.....

line_table = ggc_alloc<line_maps> ();
  linemap_init (line_table, BUILTINS_LOCATION);
  line_table->m_reallocator = realloc_for_line_map;
  line_table->m_round_alloc_size = ggc_round_alloc_size;
  line_table->default_range_bits = line_map_suggested_range_bits;

......

}

   

c_common_init_options (unsigned int decoded_options_count,
               struct cl_decoded_option *decoded_options)
{
  unsigned int i;
  struct cpp_callbacks *cb;

  g_string_concat_db
    = new (ggc_alloc <string_concat_db> ()) string_concat_db ();

  parse_in = cpp_create_reader (c_dialect_cxx () ? CLK_GNUCXX : CLK_GNUC89,
                ident_hash, line_table, ident_hash_extra);

  cb = cpp_get_callbacks (parse_in);
  cb->diagnostic = c_cpp_diagnostic;

.....

}

/* Post-switch processing.  */
bool
c_common_post_options (const char **pfilename)  

{

...

 *pfilename = this_input_filename
    = cpp_read_main_file (parse_in, in_fnames[0],
              /* We'll inject preamble pieces if this is
                 not preprocessed.  */
              !cpp_opts->preprocessed);

...

}

c_common_parse_file (void)
{
  auto dumps = g->get_dumps ();
  for (unsigned int i = 0;;)
    {
      c_finish_options ();
      /* Open the dump file to use for the original dump output
         here, to be used during parsing for the current file.  */
      dumps->dump_start (TDI_original, &dump_flags);
      pch_init ();
      push_file_scope ();
      c_parse_file ();
      pop_file_scope ();
      /* And end the main input file, if the debug writer wants it  */
      if (debug_hooks->start_end_main_source_file)
    (*debug_hooks->end_source_file) (0);
      if (++i >= num_in_fnames)
    break;
      cpp_undef_all (parse_in);
      cpp_clear_file_cache (parse_in);
      this_input_filename
    = cpp_read_main_file (parse_in, in_fnames[i]);

      /* If an input file is missing, abandon further compilation.
     cpplib has issued a diagnostic.  */
      if (!this_input_filename)
    break;
      dumps->dump_finish (TDI_original);
    }

  c_parse_final_cleanups ();
  dumps->dump_finish (TDI_original);
}

 

/* Fetch the next token from PFILE that is not CPP_PADDING, leaving a
   CPP_EOF token unconsumed.  Unlike the streaming path used in
   preprocess_only mode, this performs no optional streaming, which makes
   it suitable for processing builtin expansions such as
   c_common_has_attribute.  */

static const cpp_token *
get_token_no_padding (cpp_reader *pfile)
{
  const cpp_token *tok;

  do
    {
      /* Peek first so that an end-of-file token is returned without
         being consumed.  */
      tok = cpp_peek_token (pfile, 0);
      if (tok->type == CPP_EOF)
        break;

      /* Not at EOF: consume the token; padding is skipped by looping.  */
      tok = cpp_get_token (pfile);
    }
  while (tok->type == CPP_PADDING);

  return tok;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

GoldKey

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值