#!/usr/bin/perl
$version = "v0.91";
#
# MK CHECKLINKS v0.91 - 98-10-26
# by Michael Khler (mikoehler@vossnet.de)
# Bandkampsweg 2
# D-22459 Hamburg / Germany
# Tel: +(49 or German area code) 40-75492299 (Answering machine)
# Fax: 				 40-75492289 (14400 bsp)
#
# language: PERL 5 (Version 5.004_04)
#
#
# 	SEE README and CHANGES for more details
# 	=======================================
#
# USERS
# =====
# The autor of this script wants to receive $45, if u use it
# in any commercial circumstances.
#
# Request a license from mikoehler@vossnet.de before using
# in public or commercial environment.
#
# NON-PROFIT organisations receive a free licence!.
# 
# You can set up your own @IGNORELIST here
# =========================================
# @IGNORELIST    = ('MAILTO:','PORN.JPG','BLIND.GIF','JAVASCRIPT');

# default ignorelist 
@IGNORELIST    = ('MAILTO:','JAVASCRIPT');

$fetch_hdr_cmd = "lynx -source -head";
$fetch_src_cmd = "lynx -source";
@LINES         = ();
@HTTPLINKS     = ();
@RESLINKS      = ();
@LINKPOS       = ('HREF="','SRC="');

# some vars
###########

# chkmode indicate the actual link checking mode , can be file or http
$chkmode       = 'null';
# chkmode indicate the actual link checking style , can be loss or strict
$checkStyle    = 'loss';

# compare strings
sub cstr{
my $a = uc($_[0]);
my $b = uc($_[1]);
  return (index($a,$b)==0);
}

# index strings
sub cidx{
my $a = uc($_[0]);
my $b = uc($_[1]);
   return (index($a,$b)!=-1);
}


# Checking Link, used by checkLink
sub chkL{
my $link = $_[0];
my @cmd  = `$fetch_hdr_cmd $link`;
my $l = $cmd[0];
#print "L:$link\n";
#print "L:@cmd\n\n";
  # checking HTML header return codes and return boolean value
  if ($checkStyle eq "loss") {
    return ((index($l,"HTTP/")==0) && ((index($l,"200")==9) || (index($l,"302")==9)));
  } elsif ($checkStyle eq "strict") {
    return ((index($l,"HTTP/")==0) && (index($l,"200")==9));
  } else { die 'Variable \$checkStyle must be "loss" or "strict"' };
}

# Checking Link
sub checkLink{
my $link = $_[0];
   if (chkL($link)){
     print "     OK";
   } else {
     print "INVALID";
   }
   print " $link\n";
}

# cuts the Http Reference from a string
sub cutHREF{
my $l = $_[0];
   $l = substr($l,index(uc($l),'HREF="')+6,255);
   $l = substr($l,0,index($l,'"'));
   return $l;
}

# check if a link ref. is in the given string
sub hasLink{
my $l  = $_[0];
my $l2 = '';
   if (length($l)==0) {return false;}
   foreach(@LINKPOS){
     $l2 = $_;
     if (cidx($l,$_)){
        return true; 
     }
   }
   return false;
}

# count the HTTP references
sub countHTTPlinks{
my $x=0;
  foreach(@HTTPLINKS){ 
    $x=$x+1;
   }
   return $x;
}

# count the Resource references
sub countRESlinks{
my $x=0;
  foreach(@RESLINKS){ 
    $x=$x+1;
   }
   return $x;
}

# return pos. of HTTP ref.
sub LhttpPos{
my $l = uc($_[0]); return index($l,@LINKPOS[0]);
}

# return pos. of resource ref.
sub LresPos{
my $l = uc($_[0]); return index($l,@LINKPOS[1]);
}

# check if HTTP ref. or not
sub isHTTP{
my $l = uc($_[0]); 
  if (LhttpPos($l)==-1) { return false };
  if (LresPos($l)==-1) { return true };
  if (LhttpPos($l) < LresPos($l)) { return true;} else {return false;}
}



# extract links from a line
sub extractLinks{
  sub ignoreLink{
    foreach (@IGNORELIST){
      if (cidx(uc($_[0]),$_)){
        return true;
      }
    }
    return false;
  }
  sub getHost{
  my $host = $ARGV[0];
    $host = substr($host,7,255);
    return "http://".substr($host,0,index($host,"/"));
  }

my $l  = $_[0];
my $l2 = '';
  while (hasLink($l) eq true){
    if (isHTTP($l) eq true) {
      $l2 = substr($l,index(uc($l),@LINKPOS[0])+length(@LINKPOS[0]),255);
      $l  = substr($l2,index($l2,'"')+1,255);
      $l2 = substr($l2,0,index($l2,'"'));
      if (ignoreLink($l2) eq false) { 
        if ( ($chkmode eq "http") && (cidx($l2,"http://")==false) ){
          if ($l2[0] ne "/"){ $l2 = "/$l2"; }
          $l2 = getHost.$l2;
#	  print "l2:$l2\n";
        }
        push @HTTPLINKS , $l2; 
      }
    } else {
      $l2 = substr($l,index(uc($l),@LINKPOS[1])+length(@LINKPOS[1]),255);
      $l  = substr($l2,index($l2,'"')+1,255);
      $l2 = substr($l2,0,index($l2,'"'));
      if (ignoreLink($l2) eq false) { 
        if ( ($chkmode eq "http") && (cidx($l2,"http://")==false) ){
          if ($l2[0] ne "/"){ $l2 = "/$l2"; }
          $l2 = getHost.$l2;
#	  print "l2:$l2\n";
        }
        push @RESLINKS , $l2; 
      }
    }
  }
}

# number of total resources in current list
sub totalRes{
my  $total = 0;
    foreach (@RESLINKS){
      $total = $total+1;
    }
    return $total;
}

# remove same entrys in resource list
sub RmvResDup{
my $idxA = 0;
my $idxB = 0;
   do {
      do { 
           if (@RESLINKS[$idxA] == @RESLINK[$idxB]){
           splice(@RESLINKS,$idxB,1);
	}
	$idxB=$idxB+1;
      } until ($idxB>=totalRes);
    $idxA=$idxA+1;
    $idxB=$idxA+1;
   } until ($idxA>=totalRes);
}

# process checking links
sub procLinks{
my $x=0;
my $lnkp = $ARGV[0];
   
   # Then check links
   foreach (@LINES){
     $l = $_;
     chop($l);
     extractLinks($l);
   }

   RmvResDup;

   print "Processing ".countHTTPlinks." HTTP and ".countRESlinks." Resource links\n";
   
   foreach (@HTTPLINKS){
     checkLink($_);
#     print "$_\n";
  }

   foreach (@RESLINKS){
     checkLink($_);
#     print "$_\n";
   }
}

# process a local file with links
sub procFile{
my $f = $_[0];
   $chkmode = "file";
   open (DATA, "<$f") || failure("File: $f not found");
   @LINES=<DATA>;
   close(DATA);
   procLinks;
}

# yes, this code is terrible..

# process a http file with links
sub procHTTP{
my $f = $_[0];
   $chkmode = "http";
   print "Fetching HTTP file (please stand by)\n";
   @LINES = `$fetch_src_cmd $f`;
   procLinks;
}

# failure procedure
sub failure{
   print "\nFAILURE\n$_[0]\nAborting..\n";
   exit;
}

sub noArgs{
  print "MKClinks $version (c) M.Khler <mikoehler\@vossnet.de>\n\n";
  print "usage: mkclinks <file or http reference> [--strict]\n";
  print "   ex: mkclinks http://www.domain.com/links.html\n";
  print "       mkclinks /usr/local/httpd/htdocs/links.html\n\n";
  print "Give a local http file or a web reference\n";
  print "as argument to mkclings for link checking.\n\n";
  print "May define --strict, for stricter link checking\n\n";
  exit 0
}

# main routine
sub mymain{
   if (cidx($ARGV[1],"strict")){
     $checkStyle = 'strict';
   }
   if (cidx($ARGV[0],"http://")){
     procHTTP($ARGV[0]);
   } elsif ($ARGV[0] gt " ") {
     procFile($ARGV[0]);
   } else { noArgs }
   print "\nThank you for using MKCheckLinks $version from mikoehler\@vossnet.de\n";
}

mymain;