/* vim: set sw=8 ts=8 si : */
/* Author: Guido Socher, Copyright: GPL */
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>
#include <ctype.h>
#include <strings.h>
#include <string.h>
#include "htag.h"
#include "hash.h"
#include "config.h"
/* scratch buffer for the "non existing anchor" message that
 * print_result hands over to add_to_fifo */
static char message[MAXFIFO_STRLEN+1];
/* target of the attribute currently examined, filled in by
 * copy_file_path (surrounding quotes removed) */
static char pathstr[MAXTAGLEN+1];
/* path assembled by print_result for stat() and for anchor references */
static char linkpath[MAXTAGLEN+1];
static char *filename; /* the file name currently read */
static int typeoflink=0; /* 0 initial val, 1 href, 2 src, 3 background, 4 name */
/* attribute name used in messages, indexed by typeoflink */
static char *typeoflinkstr[]={"","href","src","background","name"};
static int tot_count=0; /* links handled by print_result (those skipped by -n excluded) */
static int rel_count=0; /* non empty links for which is_abs returned 0 or 1 */
static int broken_count=0; /* broken links and missing anchors reported */
/* for option parser */
static int opt_a=0; /* -a: print abs links of form proto:// */
static char *opt_d=NULL; /* points to docroot if set */
static int opt_f=0; /* -f: print abs file system links (is_abs result 2) */
static int opt_A=0; /* -A: do not read other files to check named anchors */
static int opt_F=0; /* -F: do not warn about file: links */
static int opt_s=0; /* -s: print statistics at the end */
static int opt_O=0; /* -O: do not warn about non world readable files */
static int opt_n=0; /* -n was given, opt_n_list is valid */
static int opt_w=0; /* -w was given, opt_w_list is valid */
static char **opt_w_list; /* NULL terminated list built by string_to_list for -w */
static char **opt_n_list; /* NULL terminated list built by string_to_list for -n */
/*end global data*/

/* Print the usage text (followed by VERINFO when that macro is
 * defined) on stdout and terminate the program with exit status 0.
 * This function does not return. */
void help()
{
	/* the text contains no conversion specifiers and is written as is */
	static const char usage[]="blnkcheck -- check links in html pages\n\
\n\
USAGE: blnkcheck [-AafhOs] [-d docroot] [-n list] [-w list ] html-files\n\
\n\
OPTIONS: -h this help\n\
         -a print absolute links of the form \"proto://\" and not check\n\
            relative links\n\
         -d document root directory to check abs. filesystem links\n\
	    e.g -d /home/httpd/html\n\
         -f print the absolute file system links of the form \"/xxx\" and\n\
            do not check relative links\n\
         -n ignore links that match a given sub-string.\n\
	    List is a comma (or space) separated list of the from\n\
	    substring1,substring2,...\n\
	    This function is not case sensitive.\n\
         -s print some statistic about checked links.\n\
         -w warn about absolute links matching a given sub-string.\n\
	    List is a comma (or space) separated list of the from\n\
	    substring1,substring2,...\n\
	    This function is not case sensitive.\n\
         -F do not warn about file:/ links\n\
         -A do not open any other files than the files given on the\n\
	    command line in order to check for the existence of\n\
	    named anchors. E.g the file index.html may have a href to\n\
	    somepage.html#ref Running \"blnkcheck -A index.html\" will\n\
	    only check that the file somepage.html exists. The file will not\n\
	    be opened to check if there is a named anchor called \"ref\".\n\
	 -O do not warn when a file is not world readable.\n\
\n\
EXAMPLE: check relative broken links in the file tree \n\
 below /home/httpd/html:\n\
  (cd /home/httpd/html;blnkcheck `find . -name \\*.html` | sort)\n\
 check a single web-page called index.html:\n\
  blnkcheck index.html\n\
\n\
This program is very fast as it does not need a web-server. It searches\n\
only the plain file system. Error messages are of the same format as gcc\n\
error messages and can be read by many common editors (e.g vim or emacs).\n";

	fputs(usage,stdout);
#ifdef VERINFO
	puts(VERINFO);
#endif
	exit(0);
}
/* Lower case a '\0' terminated string in place.
 * str: the string to convert, it is modified. */
void lowcasestr(char *str){
	for(;*str;str++){
		/* the ctype functions are only defined for values that fit
		 * into an unsigned char (or EOF). A plain char may be
		 * negative for bytes >= 0x80, therefore convert first. */
		*str=(char)tolower((unsigned char)*str);
	}
}
/* Convert a string list (space, tab or comma separated) into a
 * NULL terminated array of lower case strings and return a pointer to it.
 * string="aaa bb,cc,  dd" becomes result[]={"aaa","bb","cc","dd",NULL}
 * Only the first 50 elements of string are taken and the rest is
 * ignored.
 * string itself is not modified, the result lives in memory obtained
 * from malloc. An error is printed and the program exits if that
 * memory can not be allocated.
 */
char **string_to_list(char *string){
	enum { MAXLISTELEMENTS = 50 };
	size_t len=strlen(string);
	size_t k;
	char *dat;
	char **array;
	int i=0;

	dat=malloc(len+1);
	array=malloc(sizeof(*array)*(MAXLISTELEMENTS+1));
	if (dat==NULL || array==NULL){
		fprintf(stderr,"ERROR: out of memory\n");
		exit(1);
	}
	/* copy and lower case in one go, tolower needs an unsigned
	 * char value */
	for(k=0;k<len;k++){
		dat[k]=(char)tolower((unsigned char)string[k]);
	}
	dat[len]='\0';
	while(*dat && i<MAXLISTELEMENTS){
		/* walk through space and comma, overwriting them also
		 * terminates the previous element */
		while(*dat=='\t'||*dat==' '||*dat==','){
			*dat='\0';
			dat++;
		}
		/* only separators were left */
		if (*dat=='\0') break;
		array[i++]=dat;
		/* walk through the element */
		while(*dat && *dat!='\t' && *dat!=' ' && *dat!=','){
			dat++;
		}
	}
	/* if the element limit stopped the loop then the last element is
	 * still followed by the rest of the input, cut it off here */
	*dat='\0';
	array[i]=NULL;
	return(array);
}
/* Check if any string in the NULL terminated list strlist is a
 * substring of string. A lower cased copy of string (at most
 * MAXFIFO_STRLEN characters of it) is searched, the check is therefore
 * case insensitive as long as the entries of strlist are lower case.
 * Returns 1 on the first match and 0 if nothing matches. */
int str_list_is_substring_of(char **strlist,char *string){
	char lowered[MAXFIFO_STRLEN+1];
	char **entry;

	strncpy(lowered,string,MAXFIFO_STRLEN);
	lowered[MAXFIFO_STRLEN]='\0';
	lowcasestr(lowered);
	for(entry=strlist;*entry!=NULL;entry++){
		if (strstr(lowered,*entry)!=NULL){
			return(1);
		}
	}
	return(0);
}
/* Return the directory part of a file name. The returned path does
 * not end in "/". "." is returned for a plain file name without any
 * directory, an empty string for a file directly below "/".
 * The result is kept in a static buffer that is overwritten by the
 * next call. Only the first MAXTAGLEN-1 characters of filename are
 * looked at. */
char *dirname(char *filename){
	static char dirbuf[MAXTAGLEN+1];
	char *lastslash;

	/* at most MAXTAGLEN-1 characters are copied, the bytes behind them
	 * are never set to anything but '\0' (static storage starts out
	 * zeroed), so the copy is always terminated */
	strncpy(dirbuf,filename,MAXTAGLEN-1);
	dirbuf[MAXTAGLEN]='\0';
	lastslash=strrchr(dirbuf,'/');
	if (lastslash==NULL){
		strcpy(dirbuf,".");
		return(dirbuf);
	}
	*lastslash='\0';
	return(dirbuf);
}
/* Tell whether a string is empty. Returns 1 if s has no characters
 * or consists of spaces and tabs only, otherwise 0. */
int is_empty(char *s){
	/* skip the leading run of blanks, the string is empty if
	 * nothing follows it */
	return(s[strspn(s," \t")]=='\0');
}

/*
 * Concatenate strings until a maximum length is reached.
 * You can call this function several times to append more strings.
 * i is a pointer to an integer that counts how many characters have
 * been copied so far. Use i=0 when calling the first time.
 * maxdestlen is the maximum string length, not the buffer size: the
 * terminating '\0' may be written to dest[maxdestlen], dest must
 * therefore have room for maxdestlen+1 characters.
 * source is cut off silently if it does not fit.
 * Returns the new length of the string in dest (the same as *i).
 * Use this as follows:
 * int ii;
 * char d[21];
 *
 * ii=0;
 * strappend(d,"xxxx",&ii,20)
 * strappend(d,"yyy",&ii,20)
 */
int strappend(char *dest,char *source,int *i,int maxdestlen){
	if (*i > maxdestlen) *i=maxdestlen;
	/* a negative offset would point in front of dest */
	if (*i < 0) *i=0;
	while(*source && (*i < maxdestlen)){
		dest[*i]=*source;
		source++;
		(*i)++;
	}
	dest[*i]='\0';
	return(*i);
}
/*
 * Check if this is an abs or a rel link.
 * abs file system path (something that starts with /) -> return 2
 * #xxx (something that starts with #) -> return 1
 * file: -> return 5
 * http:// https:// ftp:// (anything with ^\w+:// except file:) -> return 3
 * mailto: or javascript: (any other ^\w+: ) -> return 4
 * definitely a rel link (starts with . or matches none of the
 * above) -> return 0
 * The first character decides where possible, the rest is left to
 * matchpat with the patterns below.
 */
int is_abs(char *string){
	char *dummy;

	switch(*string){
	case '/': /* file system abs path */
		return(2);
	case '.': /* something that starts with dot is definitely a rel link */
		return(0);
	case '#':
		return(1);
	default:
		break;
	}
	if (matchpat(string,"^file:",&dummy)) return(5);
	if (matchpat(string,"^\\w\\w*://",&dummy)) return(3);
	if (matchpat(string,"^\\w\\w*:",&dummy)) return(4);
	return(0);
}
/*
 * copy the file path into the pathstr variable.
 * pathstartptr must point to the start of the possible quoted string.
 * Example: pathstartptr ponting to "index.html"> will result in
 * pathstr being set to index.html with quotes removed.
 * pathstartptr ponting to xx.html>xxxx will result in pathstr beeing set to
 * xx.html
 */
void copy_file_path(char *pathstartptr){
	char *qptr,*dest;
	dest=pathstr;
	if (*pathstartptr == '"' || *pathstartptr == '\''){
		qptr=pathstartptr;
		pathstartptr++;
	}else{
		qptr=NULL;
	}
	while(*pathstartptr){
		if (qptr == NULL){
			/* wait for space or '>' */
			if (*pathstartptr== ' '||*pathstartptr == '>'){
				break;
			}
		}else{
			if (*pathstartptr==*qptr){
				/*found closing quot */
				break;
			}
		}
		*dest=*pathstartptr;
		dest++;
		pathstartptr++;
	}
	*dest='\0';
}
/*
 * check if link exists and print results
 */
int print_result(int l){
	char *dir;
	struct stat stbuf;
	char *chptr;
	int i,ckanchor,lnk;
	if (opt_n && str_list_is_substring_of(opt_n_list,pathstr)){
		/* ignore this link */
		goto ENDOFPRT;
	}
	tot_count++;
	lnk=is_abs(pathstr);
	if (lnk==5 && opt_F ==0){
		printf("%s:%d: %s=\"%s\" warning direct ref to file.\n",filename,l,typeoflinkstr[typeoflink],pathstr);
	}
	if (opt_a || opt_d || opt_f){
		if (lnk==3 && opt_a){
			printf("%s:%d: %s=\"%s\"\n",filename,l,typeoflinkstr[typeoflink],pathstr);
		}
		if(lnk==2 && opt_f){
			printf("%s:%d: %s=\"%s\"\n",filename,l,typeoflinkstr[typeoflink],pathstr);
		}
		if(opt_d && lnk==2){
			i=0;
			strappend(linkpath,opt_d,&i,MAXTAGLEN);
			strappend(linkpath,pathstr,&i,MAXTAGLEN);
			if (stat(linkpath,&stbuf)!=0){
				printf("%s:%d: %s=\"%s\" abs path is broken\n",filename,l,typeoflinkstr[typeoflink],linkpath);
				broken_count++;
			}
		}
		goto ENDOFPRT;
	}
	/* if opt_f or opt_a are set and we got until here then
	 * go to end and do not ckeck relative links */
	if (opt_f || opt_a) goto ENDOFPRT;
	if ((lnk==2 || lnk==3) && opt_w){
		/* warn about abs link to certain server */
		if (str_list_is_substring_of(opt_w_list,pathstr)){
			printf("%s:%d: warning abs link %s=\"%s\"\n",filename,l,typeoflinkstr[typeoflink],pathstr);
		}
		goto ENDOFPRT;
	}
	if (is_empty(pathstr)){
		/*warn about empty links*/
		printf("%s:%d: %s=\"%s\" is an empty link\n",filename,l,typeoflinkstr[typeoflink],pathstr);
		goto ENDOFPRT;
	}
	/* count relative links excluding empty ones */
	if (lnk<2) rel_count++;
	if (lnk==1){
		/* get ready for anchor checking ...*/
		/* this is a link of the type href=#something */
		i=0;
		strappend(linkpath,filename,&i,MAXTAGLEN);
		strappend(linkpath,pathstr,&i,MAXTAGLEN);
		/* generate the message that should be printed in 
		 * case the anchor does not exist */
#if defined USESNPRINTF
		snprintf(message,(size_t)MAXFIFO_STRLEN-1,"%s:%d: reference to non existing anchor %s=\"%s\"",filename,l,typeoflinkstr[typeoflink],pathstr);
#else //USESNPRINTF
		/* this is not secure but some unix systems have only sprintf */
		sprintf(message,"%s:%d: reference to non existing anchor %s=\"%s\"",filename,l,typeoflinkstr[typeoflink],pathstr);
#endif //USESNPRINTF
		message[MAXFIFO_STRLEN-1]='\0';
		/* add to the list of named anchors to check */
		add_to_fifo(0,linkpath,message);
		goto ENDOFPRT;
	}
	if (lnk>0){
		/* not a real rel link */
		goto ENDOFPRT;
	}
	dir=dirname(filename);
	/*construct the full path, Note: even if pathstr is
	 *a relative link it may still look like: ../info.html#sec1 
	 *We need to remove the #sec1 to get the file path and
	 *we need to check for a named anchor in that case. It is also 
         *possible to have cgi-bin's between the html 
         *pages: ../qer.pl?val=1
         */
	chptr=dir;
	i=0;ckanchor=0;
	while(*chptr && i < MAXTAGLEN){
		linkpath[i]=*chptr;
		i++;chptr++;
	}
	linkpath[i++]='/';
	chptr=pathstr;
	while(*chptr && i < MAXTAGLEN){
		linkpath[i]=*chptr;
		/* check for the first # */
		if (linkpath[i] == '#' && ckanchor==0){
			ckanchor=i;
			linkpath[i]='\0';
		}
                /* cgi-bin's between the html pages: */
		if (linkpath[i] == '?' ){
                        linkpath[i]='\0';
                }
		i++;chptr++;
	}
	linkpath[i]='\0';
	if (stat(linkpath,&stbuf)!=0){
		printf("%s:%d: %s=\"%s\" is broken\n",filename,l,typeoflinkstr[typeoflink],pathstr);
		broken_count++;
	}else{
		if (opt_O == 0 && (stbuf.st_mode & S_IROTH)==0){
			printf("%s:%d: warning link %s=\"%s\" non world readable file.\n",filename,l,typeoflinkstr[typeoflink],pathstr);
		}
		/* if this is a ref to a named anchor to another file...*/
		if (ckanchor && opt_A==0){
			if (opt_A && lnk!=1){
				/* add only ref to named anchors within
				 * this file */
				goto ENDOFPRT;
			}
			/* previously we had put a \0 where the # was
			 now we put it back */
			linkpath[ckanchor]='#';
			/* generate the message that should be printed in 
			 * case the anchor does not exist */
#if defined USESNPRINTF
			snprintf(message,(size_t)MAXFIFO_STRLEN-1,"%s:%d: reference to non existing anchor %s=\"%s\"",filename,l,typeoflinkstr[typeoflink],pathstr);
#else //USESNPRINTF
			sprintf(message,"%s:%d: reference to non existing anchor %s=\"%s\"",filename,l,typeoflinkstr[typeoflink],pathstr);
#endif //USESNPRINTF
			/* add to the list of named anchors to check */
			add_to_fifo(0,linkpath,message);
		}
	}
ENDOFPRT:
	return(0);
}
/* Prefix a named anchor with the name of the file currently read.
 * Example: the current file name is
 * "tmp/index.html" and we found <a name=joe></a>.
 * get_named_anchor_path("joe") will then return
 * "tmp/index.html#joe"
 * The result is cut off after MAXTAGLEN characters. It is kept in a
 * static buffer that is overwritten by the next call. */
char *get_named_anchor_path(char *namedanchor){
	static char apath[MAXTAGLEN+1];
	int i=0;

	/* strappend never writes beyond apath[MAXTAGLEN], not even for
	 * the '#' after a file name that fills the buffer already */
	strappend(apath,filename,&i,MAXTAGLEN);
	strappend(apath,"#",&i,MAXTAGLEN);
	strappend(apath,namedanchor,&i,MAXTAGLEN);
	return(apath);
}
/* Check for the type of tag, argument to findtag.
 * wholetag is the text of the tag, linenumber the line used in the
 * messages. is_anchor is not used by this function.
 * Always returns 0. */
int evaltag(char *wholetag,int linenumber,int is_anchor){
	char *value;
	int found=1;
	/*
	 * It is possible to have e.g an image inside an anchor.
	 * We must therefore always check for anchors (name= and href=)
	 * and then for src...
	 * <a name="Top"><img src="image.jpg">Image</a>
	 * A tag like <a name="xxxx" href="...."> .... </a> is also
	 * possible.
	 * It is not possible to have background together with an anchor.
	 * It is also not possible to have nested anchors
	 */
	/* typeoflink: 1 href, 2 src, 3 background */
	if (matchpat(wholetag," href *= *",&value)){
		typeoflink=1;
	}else if (matchpat(wholetag," background *= *",&value)){
		typeoflink=3;
	}else{
		found=0;
	}
	if (found){
		copy_file_path(value);
		print_result(linenumber);
	}
	/* now check for image inside anchor <a ...><img src...></a> */
	if (matchpat(wholetag," src *= *",&value)){
		copy_file_path(value);
		typeoflink=2;
		print_result(linenumber);
	}
	/* a named anchor is only remembered (fifo class 2), there is
	 * nothing to print for it */
	if (matchpat(wholetag," name *= *",&value)){
		copy_file_path(value);
		add_to_fifo_unless_there(2,get_named_anchor_path(pathstr),"");
	}
	return(0);
}
/* Check if this is a named anchor, argument to findtag.
 * A name= value found in wholetag is stored as "filename#value" in
 * fifo class 2 unless it is there already. linenumber and is_anchor
 * are not used by this function. Always returns 0. */
int evaltag_name_only(char *wholetag,int linenumber,int is_anchor){
	char *value;

	if (!matchpat(wholetag," name *= *",&value)){
		return(0);
	}
	copy_file_path(value);
	add_to_fifo_unless_there(2,get_named_anchor_path(pathstr),"");
	return(0);
}
/* Return the path component from a string of the form
 * path#something, i.e. everything in front of the first '#'.
 * A string without '#' is returned unchanged.
 * The result is kept in a static buffer that is overwritten by the
 * next call. Only the first MAXTAGLEN-1 characters of s are looked at. */
char *path_of_anchor(char *s){
	static char pathbuf[MAXTAGLEN+1];
	char *hash;

	/* at most MAXTAGLEN-1 characters are copied, the bytes behind them
	 * are never set to anything but '\0' (static storage starts out
	 * zeroed), so the copy is always terminated */
	strncpy(pathbuf,s,MAXTAGLEN-1);
	pathbuf[MAXTAGLEN]='\0';
	hash=strchr(pathbuf,'#');
	if (hash!=NULL){
		*hash='\0';
	}
	return(pathbuf);
}


/* Parse the options, check the links of all html files given on the
 * command line and finally check the references to named anchors that
 * were collected on the way. Statistics are printed if -s is set
 * (and neither -a nor -f).
 * Returns 0 also if broken links were found. The program exits with 1
 * for an unknown option. The help text is printed if -h is given or
 * if no file is given. */
int main(int argc, char *argv[])
{
	int i;
	char *aref,*msg;
	/* The following things are used for getopt: */
	extern char *optarg;
	extern int optind;
	extern int opterr;
	int ch;

	/* we print our own error message */
	opterr = 0;
	while ((ch = getopt(argc, argv, "Aad:FfhsOw:n:")) != -1) {
		switch (ch) {
		case 'A':
			opt_A=1;
			break;
		case 'F':
			opt_F=1;
			break;
		case 'a':
			opt_a++;
			break;
		case 'd':
			/* optarg points into argv which stays valid until
			 * the program ends, no copy is needed */
			opt_d=optarg;
			break;
		case 'f':
			opt_f=1;
			break;
		case 's':
			opt_s=1;
			break;
		case 'O':
			opt_O=1;
			break;
		case 'n':
			opt_n_list=string_to_list(optarg);
			opt_n=1;
			break;
		case 'w':
			opt_w_list=string_to_list(optarg);
			opt_w=1;
			break;
		case 'h':
			help(); /*no break, help does not return */
		case '?':
			fprintf(stderr, "ERROR: No such option. -h for help.\n");
			exit(1);
		default:
			break;
		}
	}
	/* no relative links are checked with -f or -a, there is nothing
	 * to count then */
	if (opt_f || opt_a) opt_s=0;

	if (optind == argc){
		help();
	}
	init_fifo_class(0); /* the references to named anchors */
	init_fifo_class(1); /* files already checked */
	init_fifo_class(2); /* the named anchors that were found  */
	for(i=optind;i<argc;i++){
		/* put the file names that will be anyhow checked
		 * into this buffer (this is for named anchor checks) */
		add_to_fifo_unless_there(1,argv[i],"");
	}
	/* we search for:
	 * _HREF="http://www.xxx/" _SRC="xxxx" (the _ is a space)
	 * _BACKGROUND="xxxx" _NAME="yyyy"
	 * Note: we search for it only inside < ... > and not outside
	 * Some broken html pages use however "<" instead of &lt; therefore
	 * findtag reset the state after MAXTAGLEN characters */
	for(i=optind;i<argc;i++){
		/* search for html tags and call the function evaltag */
		filename=argv[i];
		findtag(evaltag,filename,1);
	}
	/* now we must look after our named anchors */
	while(read_out_of_fifo(0,&aref,&msg)){
		filename=path_of_anchor(aref);
		/* did we already read this file ?*/
		if (!is_in_fifo(1,filename)){
			/* file not read yet, with -A it stays unread and
			 * the anchor is not checked */
			if (opt_A) continue;
			add_to_fifo(1,filename,"");
			findtag(evaltag_name_only,filename,1);
		}
		/* check existence of anchor */
		if (!is_in_fifo(2,aref)){
			puts(msg);
			broken_count++;
		}
	}
	if (opt_s){
		printf("\n| Total number of all (abs+rel) links: %d\n| Number of relative links: %d\n| Number of broken relative links: %d\n",tot_count,rel_count,broken_count);
	}
	return(0);
}
