如何阅读源代码之二(转帖) -- 业余空间-- 编程爱好者博客

作为一个C程序，在头文件里面，和C文件里面定义的extern变量，结构等等肯定不会少，但是，单独看这些东西我们不可能对这个程序有什么认识。所以，从main函数入手，逐步分析，在需要的时候再回头来看这些数据结构定义才是好的方法。（顺便说一句，Visual C++, 等windows下的IDE工具提供了很方便的方法来获取函数列表，C++的类列表以及资源文件，对于阅读源代码很有帮助。Unix/Linux也有这些工具，但是，我们在这里暂时不说，而只是通过最简单的文本编辑器vi来讲)。跳过webalizer.c开头的版权说明部分（GPL的），和数据结构定义，全局变量声明部分，直接进入main()函数。在函数开头，我们看到：/* initalize epoch */epoch=jdate(1,1,1970); /* used for timestamp adj. *//* add default index. alias */add_nlist("index.",&index_alias);这两个函数暂时不用仔细看，后面会提到，略过。sprintf(tmp_buf,"%s/webalizer.conf",ETCDIR);/* check for default config file */if (!access("webalizer.conf",F_OK))get_config("webalizer.conf");else if (!access(tmp_buf,F_OK))get_config(tmp_buf);从注释和程序本身可以看出，这是查找是否存在一个叫做webalizer.conf的配置文件，如果当前目录下有，则用get_config来读入其中内容，如果没有，则查找ETCDIR/webalizer.conf是否存在。如果都没有，则进入下一部分。(注意：ETCDIR = @ETCDIR@在makefile中有定义）/* get command line options */opterr = 0; /* disable parser errors */while ((i=getopt(argc,argv,"a:A:c:C:dD:e:E:fF:g:GhHiI:l:Lm:M:n:N:o:pP:qQr:R:s:S:t:Tu:U:vVx:XY"))!=EOF){switch (i){case 'a': add_nlist(optarg,&hidden_agents); break; /* Hide agents */case 'A': ntop_agents=atoi(optarg); break; /* Top agents */case 'c': get_config(optarg); break; /* Config file */case 'C': ntop_ctrys=atoi(optarg); break; /* Top countries */case 'd': debug_mode=1; break; /* Debug */case 'D': dns_cache=optarg; break; /* DNS Cache filename */case 'e': ntop_entry=atoi(optarg); break; /* Top entry pages */case 'E': ntop_exit=atoi(optarg); break; /* Top exit pages */case 'f': fold_seq_err=1; break; /* Fold sequence errs */case 'F': log_type=(optarg[0]=='f')?LOG_FTP:(optarg[0]=='s')?LOG_SQUID:LOG_CLF; break; /* define log type */case 'g': group_domains=atoi(optarg); break; /* GroupDomains (0=no) */case 'G': hourly_graph=0; break; /* no hourly graph */case 'h': print_opts(argv[0]); break; /* help */case 'H': hourly_stats=0; break; /* no hourly stats */case 'i': ignore_hist=1; break; /* Ignore history */case 'I': add_nlist(optarg,&index_alias); break; /* Index alias */case 'l': 
graph_lines=atoi(optarg); break; /* Graph Lines */case 'L': graph_legend=0; break; /* Graph Legends */case 'm': visit_timeout=atoi(optarg); break; /* Visit Timeout */case 'M': mangle_agent=atoi(optarg); break; /* mangle user agents */case 'n': hname=optarg; break; /* Hostname */case 'N': dns_children=atoi(optarg); break; /* # of DNS children */case 'o': out_dir=optarg; break; /* Output directory */case 'p': incremental=1; break; /* Incremental run */case 'P': add_nlist(optarg,&page_type); break; /* page view types */case 'q': verbose=1; break; /* Quiet (verbose=1) */case 'Q': verbose=0; break; /* Really Quiet */case 'r': add_nlist(optarg,&hidden_refs); break; /* Hide referrer */case 'R': ntop_refs=atoi(optarg); break; /* Top referrers */case 's': add_nlist(optarg,&hidden_sites); break; /* Hide site */case 'S': ntop_sites=atoi(optarg); break; /* Top sites */case 't': msg_title=optarg; break; /* Report title */case 'T': time_me=1; break; /* TimeMe */case 'u': add_nlist(optarg,&hidden_urls); break; /* hide URL */case 'U': ntop_urls=atoi(optarg); break; /* Top urls */case 'v':case 'V': print_version(); break; /* Version */case 'x': html_ext=optarg; break; /* HTML file extension */case 'X': hide_sites=1; break; /* Hide ind. sites */case 'Y': ctry_graph=0; break; /* Supress ctry graph */}}if (argc - optind != 0) log_fname = argv[optind];if ( log_fname && (log_fname[0]=='-')) log_fname=NULL; /* force STDIN? *//* check for gzipped file - .gz */if (log_fname) if (!strcmp((log_fname+strlen(log_fname)-3),".gz")) gz_log=1;这一段是分析命令行参数及开关。（getopt()的用法我在另外一篇文章中讲过，这里就不再重复了。）可以看到，这个软件虽然功能不太复杂，但是开关选项还是不少。大多数的unix/linux程序的开头部分都是这个套路，初始化配置文件，并且读入分析命令行。在这段程序中，我们需要注意一个函数：add_nlist(). 
print_opts(), get_config()等等一看就明白，就不用多讲了。这里我们已经是第二次遇到add_nlist这个函数了，就仔细看看吧。$ grep add_nlist *.h linklist.h:extern int add_nlist(char *, NLISTPTR *); /* add list item */可以发现它声明在linklist.h中。在这个h文件中，当然会有一些数据结构的定义，比如：struct nlist { char string[80]; /* list struct for HIDE items */struct nlist *next; };typedef struct nlist *NLISTPTR;struct glist { char string[80]; /* list struct for GROUP items */char name[80];struct glist *next; };typedef struct glist *GLISTPTR;这是两个链表结构。还有extern GLISTPTR group_sites ; /* "group" lists */extern GLISTPTR group_urls ;extern GLISTPTR group_refs ;这些都是链表，太多了，不用一一看得很仔细，因为目前也看不出来什么东西。当然要注意它们是extern的，也就是说，可以在其他地方(文件）看到它们的数值（类似于C++中的public变量）。这里还定义了4个函数：extern char *isinlist(NLISTPTR, char *); /* scan list for str */extern char *isinglist(GLISTPTR, char *); /* scan glist for str */extern int add_nlist(char *, NLISTPTR *); /* add list item */extern int add_glist(char *, GLISTPTR *); /* add group list item */注意，这些都是extern的，也就是说，可以在其他地方见到它们的调用(有点相当于C++中的public函数）。再来看看linklist.c，NLISTPTR new_nlist(char *); /* new list node */void del_nlist(NLISTPTR *); /* del list */GLISTPTR new_glist(char *, char *); /* new group list node */void del_glist(GLISTPTR *); /* del group list */int isinstr(char *, char *);这5个函数是内部使用的（相当于C++中的private), 也就是说，这些函数只被isinlist(NLISTPTR, char *), isinglist(GLISTPTR, char *), add_nlist(char *, NLISTPTR *), add_glist(char *, GLISTPTR *)调用，而不会出现在其他地方。所以，我们先来看这几个内部函数。举例来说，new_nlist(char *)：NLISTPTR new_nlist(char *str){NLISTPTR newptr;if (sizeof(newptr->string) < strlen(str)){if (verbose)fprintf(stderr,"[new_nlist] %s ",msg_big_one);}if (( newptr = malloc(sizeof(struct nlist))) != NULL){strncpy(newptr->string, str, sizeof(newptr->string));newptr->next=NULL;}return newptr;}这个函数分配了一个struct nlist, 并且把其中的string赋值为str, next赋值为NULL.这实际上是创建了链表中的一个节点。verbose是一个全局变量，控制输出信息的详细程度：数值越大输出越详细（从前面的命令行处理可以看到，-q把它设为1，即Quiet；-Q把它设为0，即Really Quiet），这里只要verbose不为0就输出警告信息。这是为了调试或者使用者详细了解程序情况来用的。不是重要内容，虽然我们常常可以在这个源程序的其他地方看到它。另外一个函数：void del_nlist(NLISTPTR *list){NLISTPTR cptr,nptr;cptr=*list;while 
(cptr!=NULL){nptr=cptr->next;free(cptr);cptr=nptr;}}这个函数删除了一个nlist（也可能是list所指向的那一个部分开始直到链表结尾），比较简单。看完了这两个内部函数，可以来看/*********************************************//* ADD_NLIST - add item to FIFO linked list *//*********************************************/int add_nlist(char *str, NLISTPTR *list){NLISTPTR newptr,cptr,pptr;if ( (newptr = new_nlist(str)) != NULL){if (*list==NULL) *list=newptr;else{cptr=pptr=*list;while(cptr!=NULL) { pptr=cptr; cptr=cptr->next; };pptr->next = newptr;}}return newptr==NULL;}这个函数是建立了一个新的节点，把参数str赋值给新节点的string, 并把它连接到list所指向链表的结尾。另外的三个函数：new_glist(), del_glist(), add_glist()完成的功能和上述三个差不多，所不同的只是它们所处理的数据结构不同。看完了这几个函数，我们回到main程序。接下来是，/* setup our internal variables */init_counters(); /* initalize main counters */我们所阅读的这个软件是用来分析日志并且做出统计的，那么这个函数的名字已经告诉了我们，这是一个初始化计数器的函数。简略地看看吧！$ grep init_counters *.h webalizer.h:extern void init_counters();在webalizer.c中找到：void init_counters(){int i;for (i=0;i<TOTAL_RC;i++) response[i].count = 0;for (i=0;i<31;i++) /* monthly totals */{tm_xfer[i]=0.0;tm_hit[i]=tm_file[i]=tm_site[i]=tm_page[i]=tm_visit[i]=0;}for (i=0;i<24;i++) /* hourly totals */{th_hit[i]=th_file[i]=th_page[i]=0;th_xfer[i]=0.0;}......}略过去一大串代码，不用看了，肯定是计数器清0。在主程序中，接下来是：if (page_type==NULL) /* check if page types present */{if ((log_type == LOG_CLF) || (log_type == LOG_SQUID)){add_nlist("htm*" ,&page_type); /* if no page types specified, we */add_nlist("cgi" ,&page_type); /* use the default ones here... */if (!isinlist(page_type,html_ext)) add_nlist(html_ext,&page_type);}else add_nlist("txt" ,&page_type); /* FTP logs default to .txt */}page_type这个变量在前面见过，case 'P': add_nlist(optarg,&page_type); break; /* page view types */根据在最开始读过的README文件，这个page_type是用来定义处理的页面的类型的。在README文件中，-P name Page type. This is the extension of files you consider to be pages for Pages calculations (sometimes called 'pageviews'). The default is 'htm*' and 'cgi' (plus whatever HTMLExtension you specified if it is different). 
Don't use a period!我们在程序中也可以看到，如果没有在命令行中或者config文件中指定，则根据处理的日志文件的类型来添加缺省的文件类型。比如对于CLF文件(WWW日志)，处理html, htm, cgi文件。if (log_type == LOG_FTP){/* disable stuff for ftp logs */ntop_entry=ntop_exit=0;ntop_search=0;}else.....这一段是说，如果是FTP的日志格式，就关闭入口页、出口页和搜索串的统计；否则（else部分）设置搜索列表。for (i=0;i<MAXHASH;i++){sm_htab[i]=sd_htab[i]=NULL; /* initalize hash tables */um_htab[i]=NULL;rm_htab[i]=NULL;am_htab[i]=NULL;sr_htab[i]=NULL;}清空哈希表，为下面即将进行的排序工作做好准备。关于哈希表，这是数据结构中常用的一种用来快速查找的结构，如果不清楚，可以参考相关书籍，比如清华的<<数据结构>>教材或者<<数据结构的C++实现>>等书。if (verbose>1){uname(&system_info);printf("Webalizer V%s-%s (%s %s) %s ",version,editlvl,system_info.sysname,system_info.release,language);}这一段，是打印有关系统的信息和webalizer程序的信息（可以参考uname的函数说明）。#ifndef USE_DNS if (strstr(argv[0],"webazolver")!=0){printf("DNS support not present, aborting... ");exit(1);}#endif /* USE_DNS */这一段，回忆我们在看README文件的时候，曾经提到过可以在编译的时候设置选项开关来设定DNS支持，在源代码中可以看到多次这样的代码段出现，如果指定了DNS支持，这些代码段则会出现（ifdef)或者不出现(ifndef)，不指定时则相反。下面略过这些代码段，不再重复。/* open log file */if (gz_log){gzlog_fp = gzopen(log_fname,"rb");if (gzlog_fp==Z_NULL){/* Error: Can't open log file ... */fprintf(stderr, "%s %s ",msg_log_err,log_fname);exit(1);}}else{if (log_fname){log_fp = fopen(log_fname,"r");if (log_fp==NULL){/* Error: Can't open log file ... */fprintf(stderr, "%s %s ",msg_log_err,log_fname);exit(1);}}}这一段，回忆在README文件中曾经读到过，如果log文件是gzip压缩格式，则用gzopen函数打开（可以猜想gz***是一套针对gzip压缩格式的实时解压缩函数），如果不是，则用fopen打开。/* switch directories if needed */if (out_dir){if (chdir(out_dir) != 0){/* Error: Can't change directory to ... */fprintf(stderr, "%s %s ",msg_dir_err,out_dir);exit(1);}}同样，回忆在README文件中读到过，如果参数行有-o out_dir, 则将输出结果到该目录，否则，则输出到当前目录。在这一段中，如果输出目录不存在(chdir(out_dir) != 0)则出错。#ifdef USE_DNS if (strstr(argv[0],"webazolver")!=0){if (!dns_children) dns_children=5; /* default dns children if needed */if (!dns_cache){/* No cache file specified, aborting... */fprintf(stderr,"%s ",msg_dns_nocf); /* Must have a cache file */exit(1);}}......

博客介绍

正文

如何阅读源代码之二(转帖)2006-12-06 22:50:00

评论