Adding hype to programs

Introduction

This is supposed to take program sources and convert them to something that Netscape will show nicely. Comments, character strings, and pre-processor directives are shown using the emphasis font, and keywords using the strong font.

You can, if you wish, bracket selected areas of the source between lines containing <doc> and </doc>. Such lines are assumed to contain reasonable HTML text, and will be output "as-is". As a bonus, the characters before the <doc> are remembered and will be removed from the bracketed lines, which means you can even use "here to end-of-line" style comments for your documentation. (The bracket lines, by the way, are suppressed.)

The other pair of bracket lines recognised is <hide>...</hide>, which causes the marked text to be quietly dropped from sight.

#include	<assert.h>
#include	<ctype.h>
#if defined (__MSDOS__)
#include	<io.h>
#endif
#include	<limits.h>
#include	<stdio.h>
#include	<stdlib.h>
#include	<string.h>
#if defined (__unix)
#include	<unistd.h>
#endif


Low Level Routines

First come the basic output routines for program text. The first, char_out, deals with the problems of converting HTML meta-characters into entity references. The second, str_out, just calls char_out in a loop. The last two, stag_out and etag_out, emit start and end tags and are used for changing type face as and when necessary.

/* While non-zero, the program-text output routines emit nothing. */
static int quiet;

/*
 * Write one character of program text to stdout, replacing the HTML
 * meta-characters '<', '>' and '&' by their entity references.
 * Nothing is written while quiet is set.
 */
static void char_out (int ch)
{
  const char *entity = NULL;

  if (quiet)
    return;

  if (ch == '<')
    entity = "&lt;";
  else if (ch == '>')
    entity = "&gt;";
  else if (ch == '&')
    entity = "&amp;";

  if (entity != NULL)
    fputs (entity, stdout);
  else
    putchar (ch);
}

/* Write a NUL-terminated string of program text through char_out(). */
static void str_out (char *s)
{
  char *p;

  for (p = s; *p != '\0'; p += 1)
    char_out (*p);
}

/*
 * Emit the start tag <s>.  The text of s is written unescaped, and
 * nothing is written while quiet is set.
 */
static void stag_out (char *s)
{
  if (! quiet)
    printf ("<%s>", s);
}

/*
 * Emit the end tag matching s.  Only the leading run of characters
 * greater than ' ' is used as the tag name, so anything after the
 * first blank in s is left out of the end tag.  Nothing is written
 * while quiet is set.
 */
static void etag_out (char *s)
{
  char *p;

  if (quiet)
    return;

  fputs ("</", stdout);
  for (p = s; *p > ' '; p += 1)
    putchar (*p);
  putchar ('>');
}

Now a handful of routines to help classify the possible characters.

/*
 * Character classification bits held in ch_type[].  They are distinct
 * bit values, so a single character may carry more than one of them.
 */
enum
{
  id_char	= 0x01,		/* char occurs in identifiers */
  space_char	= 0x02,		/* char is some type of white space */
  punct_char	= 0x04,		/* operators, brackets, and the like */
  quote_char	= 0x08,		/* start or end strings */
  escape_char	= 0x10,		/* "hide" a quote inside a string */
  nul_char	= 0x20		/* char is NUL */
};


/* Classification bits for each possible unsigned char value. */
static char ch_type [UCHAR_MAX + 1];

/*
 * OR the type bits t into the table entry of every character in the
 * string p.  Characters up to and including ' ' are ignored.
 */
static void init_from_string (char *p, int t)
{
  const unsigned char *scan;

  for (scan = (const unsigned char *) p; *scan != 0; scan += 1)
  {
    if (*scan > ' ')
      ch_type [*scan] |= t;
  }
}

/* Look up the classification bits of ch, reduced to an unsigned char. */
static int type_of (int ch)
{
  unsigned char slot = (unsigned char) ch;

  return (ch_type [slot]);
}

/*
 * Set up the language-independent part of the character table:
 * NUL, white space, and the characters always allowed in identifiers.
 */
static void init_types (void)
{
  int ch;

  ch_type [0] = nul_char;

  /* Everything from 1 up to and including ' ', plus DEL, is white space. */
  for (ch = 1; ch <= ' '; ch++)
    ch_type [ch] = space_char;
  ch_type [127] = space_char;

  /* We can usually assume letters and digits are allowed in an identifier */
  init_from_string ("abcdefghijklmnopqrstuvwxyz"
		    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		    "0123456789", id_char);
}

/*
 * Mark as punctuation every character that has no classification yet,
 * or that so far is only an escape character.
 */
static void set_punct (void)
{
  int ch;

  for (ch = 0; ch <= UCHAR_MAX; ch++)
  {
    if (ch_type [ch] & ~escape_char)
      continue;			/* already classified as something else */
    ch_type [ch] |= punct_char;
  }
}


A wrapper around malloc() or realloc() calls to save a lot of testing later.

/*
 * Pass an allocation result straight through if it is non-NULL;
 * otherwise report "Out of memory" on stderr and exit with status 1.
 */
static void *ensure (void *p)
{
  if (p != NULL)
    return (p);

  fputs ("Out of memory\n", stderr);
  exit (1);
}


A bit of string cleaning:

/*
 * Strip leading and trailing white space (any character up to and
 * including ' ') from s.  Trailing blanks are overwritten with NULs in
 * place and the returned pointer addresses the first non-blank
 * character inside the same buffer.  A NULL argument yields NULL.
 *
 * Characters are compared as unsigned char, so bytes of 0x80 and above
 * are kept even where plain char is signed; init_from_string() treats
 * them as ordinary characters in the same way.
 */
static char *trim (char *s)
{
  size_t len;

  if (! s)
    return (NULL);

  /* ltrim */
  while (*s != '\0' && (unsigned char) *s <= ' ')
    s += 1;

  /* rtrim */
  len = strlen (s);
  while (len > 0 && (unsigned char) s [len - 1] <= ' ')
    s [--len] = '\0';

  return (s);
}

Now for the lowest level input routines: get_line and push_back. The get_line routine simply calls fgets a few times until it either has a complete line or it exceeds some random limit (currently 10K). It reads into a malloced buffer, which it frees next time around. You can, of course, push_back a tail of this buffer which will look like the next line to be retrieved. This greatly simplifies a lot of the code later.

/* Text that the next get_line() call will return instead of reading. */
static char *pushed_back;
/* Stream that get_line() reads from. */
static FILE *in_file;
/* Name of the initialisation file, kept for error messages. */
static char *kw_name;

/*
 * Arrange for p to be handed out by the next get_line() call.
 * Only one pending push-back is allowed at a time.
 */
static void push_back (char *p)
{
  assert (pushed_back == NULL);
  pushed_back = p;
}

/*
 * Return the next line of input with its trailing newline removed.
 *
 * A pushed-back string, if there is one, is returned first and the
 * push-back slot is cleared.  Otherwise the line is read from in_file
 * into a buffer owned by this routine; that buffer is released the next
 * time a line is actually read, so callers must not free it or keep it
 * across such a call.
 *
 * Returns NULL when fgets() fails before anything has been read for the
 * line.  A final line with no newline is still returned.  Reading stops
 * once about 10K has been collected, in which case the remainder of the
 * line is left in the stream for the following call.
 */
static char *get_line (void)
{
  char *res = pushed_back;
  pushed_back = NULL;
  if (res)
    return (res);
  else
  {
    static char *line_buf;
    size_t line_len = 0;
    if (line_buf)
      free (line_buf);
    line_buf = NULL;
    while (line_len < 10 * 1024)
    {
      /* old_len is the amount of text already held in the buffer */
      size_t old_len = line_len;
      char *p;
      /* grow by another 1024 bytes and read into the new space */
      line_buf = ensure (realloc (line_buf, line_len += 1024));
      p = line_buf + old_len;
      if (! fgets (p, line_len - old_len, in_file))
        return (old_len ? line_buf : NULL);
      /* from here on old_len is the length of the piece just read */
      old_len = strlen (p);
      if (! old_len)
        break;
      p += old_len - 1;
      if (*p == '\n')
      {
        *p = '\0';
	break;
      }
      /* no newline yet: the next piece goes on top of the terminating NUL */
      line_len = (size_t) (p - line_buf) + 1;
    }
    return (line_buf);
  }
}



Token handling

The next_token routine assumes that t_end points to the first character worth considering for fetching a new token. This is usually where the previous token ended, hence the name, but can also be the first character of a new line.

The steps are:

/*
 * t_start and t_end delimit the current token (t_end is one past its last
 * character); prefix holds the text that preceded the most recent
 * "start doc" marker.
 */
static char *t_start, *t_end, *prefix;

/*
 * Scan the next token, starting at t_end.
 *
 * Leading white space is passed to char_out() as it is skipped.  If the
 * end of the line is reached, nul_char is returned with t_start on the
 * NUL and t_end left untouched.  Otherwise t_start and t_end are set
 * around the token and the type bits of its first character are returned.
 *
 * A token whose first character is exactly quote_char runs to the matching
 * quote (inclusive) or to the end of the line; an escape character inside
 * it causes the following character to be skipped.  Any other token is
 * extended over the following characters that share the id_char or
 * punct_char bit of the first character.
 */
static int next_token (void)
{
  int this_type;

  t_start = t_end;
  while ((this_type = type_of (*t_start)) == space_char)
    char_out (*t_start++);

  if (this_type == nul_char)
    return (nul_char);
  
  t_end = t_start;
  if (this_type == quote_char)
  {
    /* the string must be closed by the same character that opened it */
    int q = *t_start;
    for (;;)
    {
      int ch = *++t_end;
      if (ch == '\0')
        break;
      else if (ch == q)
      {
        t_end += 1;
	break;
      }
      else if ((type_of (ch) & escape_char) && t_end [1])
        t_end += 1;
    }
  }
  else
  {
    /* if the first character is neither id nor punct the mask is 0 and the
       token is just that one character */
    int mask = (this_type & (id_char | punct_char));
    while (type_of (*++t_end) & mask)
      ;
  }
  return (this_type);
}




Keyword handling

Before an identifier is output it's checked to see if it's a keyword, so that the type style can be changed if necessary. The keywords are read from an initialisation file and stored in a traditional linked list.

If the user requires that variable spelling be standardised, I also keep a list of variables that have been seen. Hopefully, this will not be a common occurrence, so I feel justified in "cheating" here.

/* One entry in a singly linked list of known spellings. */
typedef struct keyword
{
  char		 *spelling;	/* the characters of the word */
  size_t	  len;		/* number of characters in spelling */
  struct keyword *next;		/* next entry, or NULL */
} keyword;

/* Head of the keyword list. */
static keyword *kw_base;
/* Head of the list of variables seen so far. */
static keyword *var_base;

/* A length-limited string comparison with the signature of strncmp(). */
typedef int (*compare_fn) (const char *a, const char *b, size_t len);
/* Comparison used for keyword lookup; case-sensitive by default. */
static compare_fn kwcomp = (strncmp);

A non-ANSI string compare function that comes in handy

#if defined (__MSDOS__)
static compare_fn nocase_cmp = (strnicmp);
#else
/*
 * Compare at most len characters of a and b, ignoring letter case.
 * Returns 0 when they match over that span (or both end first), and
 * otherwise the difference between the first pair of lower-cased
 * characters that differ.
 *
 * Characters go through tolower() as unsigned char, which is what
 * <ctype.h> requires.  The routine is provided on every platform other
 * than MS-DOS so that callers never find it missing.
 */
static int nocase_cmp (const char *a, const char *b, size_t len)
{
  while (len-- > 0)
  {
    int ca = tolower ((unsigned char) *a++);
    int cb = tolower ((unsigned char) *b++);
    if (ca != cb)
      return (ca - cb);
    if (ca == '\0')
      break;			/* both strings ended together */
  }
  return (0);
}
#endif

/*
 * Search the list headed by kw_base for the current token
 * (t_start..t_end) using kwcomp.  Returns the stored spelling of the
 * first entry of the same length that compares equal, or NULL.
 */
static char *find_keyword (void)
{
  size_t want = (size_t) (t_end - t_start);
  keyword *kw;

  for (kw = kw_base; kw != NULL; kw = kw -> next)
  {
    if (kw -> len != want)
      continue;
    if (kwcomp (kw -> spelling, t_start, want) == 0)
      return (kw -> spelling);
  }
  return (NULL);
}

/*
 * Make sure the current token is in the list headed by kw_base and
 * return its stored spelling.  A new entry is pushed onto the front of
 * the list when the token is not already there.  The stored copy holds
 * exactly the token's characters and is not NUL-terminated.
 */
static char *add_keyword (void)
{
  char *known = find_keyword ();
  keyword *node;
  size_t len;

  if (known)
    return (known);

  len = (size_t) (t_end - t_start);
  node = ensure (malloc (sizeof (keyword)));
  node -> spelling = ensure (malloc (len));
  memcpy (node -> spelling, t_start, len);
  node -> len = len;
  node -> next = kw_base;
  kw_base = node;
  return (node -> spelling);
}

The keywords section of the initialisation file is simply a list of identifiers. These are read (by calling next_token) and stored away in the list, until we get either EOF or something starting '[' (which we'll take as meaning "end-of-section"). Comments start, as usual, with a semicolon. Unusually, this must be in the first column to allow for the odd cases where you really need a semicolon in your keywords.

/*
 * Read keyword lines until end of input or until a token starting
 * with '[' is met.  Lines with ';' in the first column are skipped;
 * every identifier token on the other lines is added to the keyword
 * list.
 */
static void read_keywords (void)
{
  for (;;)
  {
    int this_type;

    t_end = get_line ();
    if (t_end == NULL)
      return;
    if (*t_end == ';')
      continue;			/* comment line */

    while ((this_type = next_token ()) != nul_char)
    {
      if (*t_start == '[')
	return;			/* start of the next section */
      if (this_type == id_char)
	add_keyword ();
    }
  }
}



Typeface Switching

At various points in what follows we'll be changing the "typeface" between normal, bold (actually <strong>), and italic (<em>). In fact, whenever we want bold we'll call set_face(bold), no matter what was selected before. It's up to set_face() to remember what we've got, how to turn it off, and whether to suppress redundant switches.

/* The typefaces that set_face() can select. */
enum
{
  normal,
  italics,
  bold
};

/* Tag names used for the bold face (strong), the italic face (em) and
   for wrapping code (pre). */
static char *strong, *em, *pre;

/*
 * Switch the output typeface to new_face.  The face currently in
 * effect is remembered between calls; a request for the same face does
 * nothing, otherwise the old face's end tag (if any) is emitted,
 * followed by the new face's start tag (if any).
 */
static void set_face (int new_face)
{
  static int current_face;

  if (current_face == new_face)
    return;

  /* close whatever is open ... */
  if (current_face == italics)
    etag_out (em);
  else if (current_face == bold)
    etag_out (strong);

  current_face = new_face;

  /* ... and open the replacement */
  if (new_face == italics)
    stag_out (em);
  else if (new_face == bold)
    stag_out (strong);
}



High level output

Now come the routines that do the output, depending on what mode we happen to find ourselves in. At the start, we're not_decided, since we don't know if the first non-hidden line will be code or documentation.

/*
 * The processing mode for input lines.  mode_now is the mode in force;
 * prev_mode remembers the mode to go back to when hiding ends.
 */
static enum
{
  not_decided,
  hiding,
  doing_code,
  doing_doc
} mode_now, prev_mode;

/* Non-zero while inside a block comment; counts the nesting level. */
static int comment_depth;


The two check_... routines are called to make sure the line we're working on isn't one of the magic marker lines. If it is magic, we do a mode-switch and drop the line.

/*
 * If the current line contains the "start hiding" marker, remember the
 * present mode, switch to hiding and return 1.  Otherwise return 0.
 */
static int check_hidden (void)
{
  if (strstr (t_start, "<" "hide" ">") == NULL)
    return (0);

  prev_mode = mode_now;
  mode_now = hiding;
  return (1);
}

/* Non-zero when only the documentation text is to be output. */
static int hide_code;

/*
 * If the current line contains the "start doc" marker, save whatever
 * precedes the marker as the new prefix, return to the normal face,
 * close the code tag if code was being shown, switch to doing_doc and
 * return 1.  Otherwise return 0.
 */
static int check_doc (void)
{
  char *marker = strstr (t_start, "<" "doc" ">");
  size_t pre_len;

  if (marker == NULL)
    return (0);

  pre_len = (size_t) (marker - t_start);
  free (prefix);
  prefix = ensure (malloc (pre_len + 1));
  if (pre_len != 0)
    memcpy (prefix, t_start, pre_len);
  prefix [pre_len] = '\0';

  set_face (normal);
  if (! hide_code && mode_now == doing_code)
    etag_out (pre);
  mode_now = doing_doc;
  return (1);
}

Comments are particularly nasty. Some languages allow them to nest, others don't. Some comments are delimited at both ends, some just extend to the end-of-line marker. And some languages have multiple types of comment marker. Looks like another job for linked lists.


/* One style of comment understood for the language being processed. */
typedef struct comment
{
  char		 *start;	/* text that opens the comment */
  char		 *end;		/* text that closes it; NULL if it runs to end of line */
  int		  nestable;	/* non-zero if comments of this style may nest */
  struct comment *next;		/* next style in the list, or NULL */
} comment;

/* comment_base heads the list of styles; this_comment is the style of the
   block comment currently being output. */
static comment *comment_base, *this_comment;

/*
 * Output comment text starting at t_start while inside a block comment
 * of style this_comment.
 *
 * If a closing marker is found on the line, the text up to and including
 * it is written, comment_depth drops by one and the rest of the line is
 * pushed back for another pass.  For nestable styles an opening marker
 * that comes before the closing one (or with no closing one at all) is
 * written instead and comment_depth goes up by one.  If neither is
 * found the whole remainder is written, followed by a newline.
 */
static void do_comments (void)
{
  char *end_on = this_comment -> end;
  char *end_at = strstr (t_start, end_on);

  /* from here on end_at, if set, points just past the marker */
  if (end_at)
    end_at += strlen (end_on);

  if (this_comment -> nestable)
  {
    char *start_on = this_comment -> start;
    char *start_at = strstr (t_start, start_on);
    if (start_at)
    {
      start_at += strlen (start_on);
      if (end_at == NULL || end_at > start_at)
      {
        comment_depth += 2;	/* we'll subtract 1 again later ... */
	end_at = start_at;
      }
    }
  }
  if (end_at)
  {
    while (t_start < end_at)
      char_out (*t_start++);
    comment_depth -= 1;		/* told you so! */
    push_back (t_start);
  }
  else
  {
    str_out (t_start);
    putchar ('\n');
  }
}

Here's where we check for a start-of-comment. Check the sequence of characters we're just about to output, hoping that it matches one of the "open comment" patterns. If so, emit the line (in italics) if it's a "rest-of-line" style comment, or push back the comment text and start a "block comment". If it's not a comment, return 0 and it'll get output as code.

/*
 * See whether the current token begins with one of the known comment
 * openers.  If it does, the opener is written in italics and 1 is
 * returned: for a rest-of-line comment the remainder of the line is
 * written too, while for a block comment the remainder is pushed back
 * and comment_depth is set to 1.  Returns 0 if no opener matches.
 */
static int start_comment (void)
{
  comment *c;

  for (c = comment_base; c != NULL; c = c -> next)
  {
    size_t len = strlen (c -> start);

    if ((size_t) (t_end - t_start) < len)
      continue;
    if (memcmp (t_start, c -> start, len) != 0)
      continue;

    set_face (italics);
    str_out (c -> start);
    t_start += len;
    if (c -> end == NULL)
    {
      /* rest-of-line comment */
      str_out (t_start);
      putchar ('\n');
    }
    else
    {
      /* block comment: the tail gets handled on the next pass */
      comment_depth = 1;
      this_comment = c;
      push_back (t_start);
    }
    return (1);
  }
  return (0);
}

The add_comment() routine puts things into the list, according to what gets found in the initialisation file.


/*
 * Add a comment style to the front of the list.  start and end are
 * copied; end may be NULL for a comment that runs to the end of the
 * line.  nest is stored as the style's nestable flag.
 */
static void add_comment (char *start, char *end, int nest)
{
  comment *node = ensure (malloc (sizeof (comment)));
  size_t size = strlen (start) + 1;

  node -> start = ensure (malloc (size));
  memcpy (node -> start, start, size);

  node -> end = 0;
  if (end)
  {
    size = strlen (end) + 1;
    node -> end = ensure (malloc (size));
    memcpy (node -> end, end, size);
  }

  node -> nestable = nest;
  node -> next = comment_base;
  comment_base = node;
}


Output the identifier in the preferred style of spelling: upper-case, lower-case, standardised, or as found in the source file.


/*
 * Output the current token (t_start..t_end) in the given face, leaving
 * t_start equal to t_end.
 *
 * type selects the spelling: 'u' upper-cases each character, 'l'
 * lower-cases it, 's' takes successive characters from std instead of
 * from the token, and anything else copies the token unchanged.  For 's'
 * std must supply at least as many characters as the token has.
 *
 * Characters are handed to toupper()/tolower() as unsigned char, as
 * <ctype.h> requires, and are written through char_out() so that any
 * HTML meta-character in an identifier is escaped like the rest of the
 * program text.
 */
static void correct_spelling (int face, char *std, int type)
{
  set_face (face);
  for (; t_start < t_end; t_start += 1)
  {
    unsigned char ch = (unsigned char) *t_start;

    switch (type)
    {
    case 'u':	char_out (toupper (ch));	break;
    case 'l':	char_out (tolower (ch));	break;
    case 's':	char_out (*std++);		break;
    default:	char_out (*t_start);		break;
    }
  }
}

static int kw_fold, var_fold;

static void print_id (void)
{
  char *p = find_keyword ();
  if (p)
    correct_spelling (bold, p, kw_fold);
  else
  {
    if (var_fold == 's')
    {

If we've been asked to standardise variable spellings we've got to keep a list of what we've seen so far. This is going to make things run very slowly if we've got millions of variables defined.

      keyword *keep_kw_base = kw_base;
      kw_base = var_base;
      p = add_keyword ();
      var_base = kw_base;
      kw_base = keep_kw_base;
    }
    else
      p = t_start;
    correct_spelling (normal, p, var_fold);
  }
}


The processing for a line of code: emit the tokens one by one until told otherwise.



static void do_code_line (void)
{
  int this_type;
  if (check_hidden () || check_doc () || hide_code)
    return;

  if (comment_depth)
  {
    do_comments ();
    return;
  }

  while ((this_type = next_token ()) != nul_char)
  {
    switch (this_type)
    {
    case id_char:
      print_id ();
      break;
    case quote_char:
      set_face (italics);
      while (t_start < t_end)
        char_out (*t_start++);
      break;
    case punct_char:
      if (start_comment ())
        return;
      set_face (bold);
      char_out (*t_start++);
      t_end = t_start;
      break;
    }
  }
  char_out ('\n');
}

The processing for a line of documentation: if we're not ending the doc just print the line, remembering to remove anything it might have in common with the "start doc" line.

/*
 * Process one line of documentation.  A "start hiding" marker switches
 * to hiding; an "end doc" marker switches back to code, reopening the
 * code tag unless code is hidden and restoring italics if a block
 * comment is still open.  Any other line is written as-is after
 * skipping the leading characters it shares with the saved prefix.
 */
static void do_doc_line (void)
{
  if (check_hidden ())
    return;

  if (strstr (t_start, "<" "/doc" ">") != NULL)
  {
    if (! hide_code)
      stag_out (pre);
    mode_now = doing_code;
    if (comment_depth)
      set_face (italics);
    return;
  }

  if (prefix)
  {
    char *p;

    for (p = prefix; *p != '\0' && *p == *t_start; p += 1)
      t_start += 1;
  }
  puts (t_start);
}

And, of course, hiding a line is simplest of all.

/*
 * Process one hidden line: nothing is output, and the mode in force
 * before hiding began is restored if the line holds the "end hiding"
 * marker.
 */
static void do_hiding_biz (void)
{
  char *marker = strstr (t_start, "<" "/hide" ">");

  if (marker != NULL)
    mode_now = prev_mode;
}


This is the handler for "don't know" mode. If the line isn't the start of documentation, check to see if there's a token on it. If there is, push back the whole line and read it again as code.


static void decide_then (void)
{
  char *start_here = t_start;

  if (check_hidden ())
    return;
  if (check_doc ())
  {
    quiet = 0;
    return;
  }
  if (next_token () != nul_char)
  {
    quiet = 0;
    push_back (start_here);
    mode_now = doing_code;
    if (! hide_code)
      stag_out (pre);
  }
}


Are two strings "the same", assuming blanks and case are not significant?


/*
 * Return 1 if a and b are the same once blanks (characters up to and
 * including ' ') are ignored and letter case is disregarded, else 0.
 *
 * Characters are examined as unsigned char, so bytes of 0x80 and above
 * are compared like any other character instead of being taken for
 * blanks, and tolower() is never given a negative value.
 */
static int matching_strings (char *a, char *b)
{
  const unsigned char *pa = (const unsigned char *) a;
  const unsigned char *pb = (const unsigned char *) b;

  for (;;)
  {
    while (*pa && *pa <= ' ')
      pa += 1;
    while (*pb && *pb <= ' ')
      pb += 1;
    if (tolower (*pa) != tolower (*pb))
      return (0);
    if (! *pa)
      return (1);		/* both strings ended together */
    pa += 1;
    pb += 1;
  }
}

Is one string a prefix of the other, in the style of matching_strings() above?

/*
 * Return 1 if b is a prefix of a once blanks (characters up to and
 * including ' ') are ignored and letter case is disregarded, else 0.
 * An empty or all-blank b is a prefix of anything.
 *
 * Characters are examined as unsigned char, so bytes of 0x80 and above
 * are compared like any other character instead of being taken for
 * blanks, and tolower() is never given a negative value.
 */
static int is_prefix (char *a, char *b)
{
  const unsigned char *pa = (const unsigned char *) a;
  const unsigned char *pb = (const unsigned char *) b;

  for (;;)
  {
    while (*pa && *pa <= ' ')
      pa += 1;
    while (*pb && *pb <= ' ')
      pb += 1;
    if (! *pb)
      return (1);		/* all of b has been matched */
    if (tolower (*pa) != tolower (*pb))
      return (0);
    pa += 1;
    pb += 1;
  }
}



The initialisation file contains sections delimited by headings wrapped in [...] characters. For each language I'll be interested in the [<lang> Characters] and [<lang> Keywords] sections. This is where we do the looking.


/*
 * Rewind in_file and read lines until one matches section_name
 * (ignoring blanks and case), leaving the stream positioned just after
 * it.  If no line matches, a message is printed on stderr and the
 * program exits with status 1.
 */
static void find_section (char *section_name)
{
  char *line;

  fseek (in_file, 0, SEEK_SET);
  for (line = get_line (); line != NULL; line = get_line ())
  {
    if (matching_strings (line, section_name))
      return;
  }

  fprintf (stderr, "Can't find %s section in %s\n", section_name, kw_name);
  exit (1);
}

Here's a horrible routine.

Assuming we're positioned at the right section, read a collection of lines of the form param=val and, if the param is recognised, process the val in some way. The params understood so far are:

quote or quotes
The val is a collection of string quote characters.
letter or letters
The val is a collection of characters to be considered as letters when looking for identifiers.
escape or escapes
The val is a collection of characters which will cause an immediately following string quote character not to terminate a string.
keyword case
The val should be one of lower, upper, standard, or -. These will cause all keywords to be output in, respectively, lowercase, uppercase, as defined in the initialisation file, or as found in the source.
variable case
This expects the same val arguments as keyword case, and has similar effects on those identifiers that seem not to be keywords. In the case of standard, of course, there is no list in the initialisation file to use, so the first occurrence in the source file is taken as the definitive reference. Please don't use this.
comment or comments
There are three forms of this:
comment=;
a single token means that the comment extends to the end of the line
comment=*-- --*
a pair of tokens defines a block-style comment
comment=(* *) nested
adding the word nested indicates that block-style comments can be nested one within another.

Setting any value for variable case or keyword case indicates that the language is not case sensitive.

static void read_setup (void)
{
  char *line;
  char *equals;
  while ((line = trim (get_line ())) != NULL)
  {
    if (*line == '[')
      return;
    equals = strchr (line, '=');
    if (equals)
    {
      static struct kwvals
      {
        char *kw;
	int   val;
      } params [] =
      {
        {"quote=",	quote_char},
        {"quotes=",	quote_char},
        {"letter=",	id_char},
        {"letters=",	id_char},
        {"escape=",	escape_char},
        {"escapes=",	escape_char},
	{NULL,		0}
      };
      struct kwvals *p;
      
      equals = trim (equals + 1);
      if (is_prefix (line, "keyword case="))
      {
        switch (kw_fold = tolower (*equals))
	{
	case 's': case 'u': case 'l':			break;
	default:			kw_fold = '-';	break;
	}
      }
      else if (is_prefix (line, "variable case="))
      {
        switch (var_fold = tolower (*equals))
	{
	case 's': case 'u': case 'l':			break;
	default:			var_fold = '-';	break;
	}
      }
      else if (is_prefix (line, "comment=") || is_prefix (line, "comments="))
      {
        char *end = equals + strlen (equals);
        char *close_with = equals;
	while (*close_with > ' ')
	  close_with += 1;
	if (*close_with == '\0')
	  add_comment (equals, NULL, 0);
	else
	{
	  *close_with++ = '\0';
	  close_with = trim (close_with);
	  end = close_with;
  	  while (*end > ' ')
	    end += 1;
	  *end++ = '\0';
	  add_comment (equals, close_with, *end);
	}
      }
      else
	for (p = params; p -> kw; p += 1)
	  if (is_prefix (line, p -> kw))
	  {
	    init_from_string (equals, p -> val);
	    break;
	  }
    }
  }
}


A marginally less horrible routine. Here we load the [General] section, which defines the HTML tags to use.

/*
 * Read "param=val" lines from the current position of the
 * initialisation file, stopping at end of input or at a line starting
 * with '['.  For "keyword tag", "comment tag" and "code tag" a copy of
 * val replaces the corresponding tag name (strong, em, pre); other
 * lines are ignored.
 */
static void read_general (void)
{
  static struct kwvals
  {
    char  *kw;
    char **val;
  } params [] =
  {
    {"keyword tag=",	&strong},
    {"comment tag=",	&em},
    {"code tag=",		&pre},
    {NULL,			NULL}
  };
  char *line;

  while ((line = trim (get_line ())) != NULL)
  {
    struct kwvals *entry;
    char *value;

    if (*line == '[')
      return;

    value = strchr (line, '=');
    if (value == NULL)
      continue;
    value = trim (value + 1);

    for (entry = params; entry -> kw != NULL; entry += 1)
    {
      char **slot;

      if (! is_prefix (line, entry -> kw))
	continue;

      slot = entry -> val;
      free (*slot);		/* drop any earlier setting */
      *slot = ensure (malloc (strlen (value) + 1));
      strcpy (*slot, value);
      break;
    }
  }
}


This is where we make the decision as to which language we're using and whether we're suppressing all but the documentation text.

/*
 * Interpret the command line and load the initialisation file.
 *
 * An optional leading "-d" turns on hide_code; exactly one further
 * argument, the initialisation file name, must remain, otherwise 0 is
 * returned.  The [General], [Characters] and [Keywords] sections are
 * then read in turn, tag names that were not set get their defaults,
 * and keyword lookup becomes case-insensitive if either spelling style
 * was set.  Returns 1 on success; exits if the file cannot be opened.
 */
static int init_language (int argc, char **argv)
{
  if (argc > 1 && ! strcmp (argv [1], "-d"))
  {
    hide_code = 1;
    argv += 1;
    argc -= 1;
  }

  if (argc != 2)
    return (0);

  kw_name = argv [1];
  in_file = fopen (kw_name, "r");
  if (in_file == NULL)
  {
    fprintf (stderr, "Can't open '%s'\n", kw_name);
    exit (1);
  }

  find_section ("[General]");
  read_general ();
  /* fall back on the standard tags for anything left unset */
  if (em == NULL)
    em = "em";
  if (pre == NULL)
    pre = "pre";
  if (strong == NULL)
    strong = "strong";

  find_section ("[Characters]");
  read_setup ();
  if (kw_fold != 0 || var_fold != 0)
    kwcomp = nocase_cmp;

  find_section ("[Keywords]");
  read_keywords ();

  fclose (in_file);
  return (1);
}


The main routine

Given the above, the main routine is quite simple. Do a bit of set-up, initialise according to language, then read stdin and process each line accordingly.

/*
 * Entry point.  Output stays switched off while the initialisation
 * file is loaded; then standard input is read line by line and each
 * line is handed to the routine for the mode in force.  At the end an
 * open face and code tag are closed.  Returns 1 if the input never got
 * beyond the undecided mode, and exits with a usage message if stdin
 * is a terminal or the arguments are wrong.
 */
int main (int argc, char **argv)
{
  quiet = 1;
  init_types ();
  if (isatty (fileno (stdin)) || ! init_language (argc, argv))
  {
    fprintf (stderr, "Usage: %s [-d] kw-file <source >dest.html\n", argv [0]);
    exit (1);
  }

  set_punct ();
  in_file = stdin;
  for (;;)
  {
    t_start = t_end = get_line ();
    if (t_start == NULL)
      break;

    if (mode_now == doing_code)
      do_code_line ();
    else if (mode_now == doing_doc)
      do_doc_line ();
    else if (mode_now == hiding)
      do_hiding_biz ();
    else if (mode_now == not_decided)
      decide_then ();
  }

  /* input that ends while hiding is judged by the mode it hid from */
  if (mode_now == hiding)
    mode_now = prev_mode;

  if (mode_now == not_decided)
  {
    fprintf (stderr, "Nothing useful done\n");
    return (1);
  }

  if (mode_now == doing_code)
  {
    set_face (normal);
    if (! hide_code)
    {
      etag_out (pre);
      putchar ('\n');
    }
  }
  return (0);
}