This is supposed to take program sources and convert them to something that Netscape will show nicely. Comments, character strings, and pre-processor directives are shown using the emphasis font, and keywords using the strong font.
You can, if you wish, bracket selected areas of the source between lines containing <doc> and </doc>. Such lines are assumed to contain reasonable HTML text, and will be output "as-is". As a bonus, the characters before the <doc> are remembered and will be removed from the bracketed lines, which means you can even use "here to end-of-line" style comments for your documentation. (The bracket lines, by the way, are suppressed.)
The other pair of bracket lines recognised is <hide>...</hide>, which causes the marked text to be quietly dropped from sight.
#include <assert.h>
#include <ctype.h>
#if defined (__MSDOS__)
#include <io.h>
#endif
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined (__unix)
#include <unistd.h>
#endif
First come the basic output routines for program text. The first, char_out, deals with the problems of converting HTML meta-characters into entity references. The second, str_out, just calls char_out in a loop. The last two, stag_out and etag_out, write the start and end tags used for changing type face as and when necessary.
/* Output-suppression flag: while non-zero, char_out writes nothing. */
static int quiet;

/*
 * Write one character of program text to stdout.
 *
 * The HTML meta-characters '<', '>' and '&' are replaced by their entity
 * references so that a browser displays them literally instead of
 * interpreting them as markup.  Every other character is written as-is.
 * Nothing at all is written while `quiet` is set.
 */
static void char_out (int ch)
{
    if (quiet)
        return;
    switch (ch)
    {
    case '<': fputs ("&lt;", stdout); break;
    case '>': fputs ("&gt;", stdout); break;
    case '&': fputs ("&amp;", stdout); break;
    default:  putchar (ch); break;
    }
}
/*
 * Write a NUL-terminated string of program text, passing each character
 * through char_out in turn.
 */
static void str_out (char *s)
{
    char *cursor;

    for (cursor = s; *cursor != '\0'; cursor += 1)
        char_out (*cursor);
}
/*
 * Write a start tag: the text of s wrapped in '<' and '>'.  The text is
 * written verbatim (no entity conversion).  Does nothing while `quiet`
 * is set.
 */
static void stag_out (char *s)
{
    if (! quiet)
    {
        fputs ("<", stdout);
        fputs (s, stdout);
        fputs (">", stdout);
    }
}
/*
 * Write the end tag matching s.  Only the leading run of characters
 * greater than a space is used, so anything after the first blank in s
 * (presumably tag attributes) is left out of the closing tag.  Does
 * nothing while `quiet` is set.
 */
static void etag_out (char *s)
{
    char *p;

    if (quiet)
        return;
    fputs ("</", stdout);
    for (p = s; *p > ' '; p += 1)
        putchar (*p);
    putchar ('>');
}
Now a handful of routines to help classify the possible characters.
/* Character classes.  Each is a distinct bit so classes can be OR-ed. */
enum
{
    id_char     = 1 << 0,   /* may appear in identifiers */
    space_char  = 1 << 1,   /* white space of some kind */
    punct_char  = 1 << 2,   /* operators, brackets, and so on */
    quote_char  = 1 << 3,   /* opens or closes a string */
    escape_char = 1 << 4,   /* "hides" a quote inside a string */
    nul_char    = 1 << 5    /* the NUL character */
};

/* Class bits for each possible unsigned char value. */
static char ch_type [UCHAR_MAX + 1];
/*
 * Add the class bits t to every character that appears in the string p.
 * Characters at or below a space are ignored, so blanks in p never
 * change class.
 */
static void init_from_string (char *p, int t)
{
    const unsigned char *scan;

    for (scan = (const unsigned char *) p; *scan != 0; scan += 1)
    {
        if (*scan <= ' ')
            continue;
        ch_type [*scan] |= t;
    }
}
/*
 * Look up the class bits of a character.  The value is narrowed to
 * unsigned char first, so a negative plain char still indexes inside
 * the table.
 */
static int type_of (int ch)
{
    const unsigned char index = (unsigned char) ch;

    return (ch_type [index]);
}
/*
 * Set up the default character classes: NUL is nul_char, codes 1 up to
 * and including the space character plus DEL (127) are white space, and
 * the ASCII letters and digits are identifier characters.
 */
static void init_types (void)
{
    int i;

    ch_type [0] = nul_char;
    for (i = 1; i <= ' '; i += 1)
        ch_type [i] = space_char;
    ch_type [127] = space_char;

    /* We can usually assume letters and digits are allowed in an identifier */
    init_from_string ("abcdefghijklmnopqrstuvwxyz", id_char);
    init_from_string ("ABCDEFGHIJKLMNOPQRSTUVWXYZ", id_char);
    init_from_string ("0123456789", id_char);
}
/*
 * Mark as punctuation every character that has no class yet.  The
 * escape bit is ignored for this test, so a character whose only class
 * is escape_char becomes punctuation as well.
 */
static void set_punct (void)
{
    int code = 0;

    while (code <= UCHAR_MAX)
    {
        const int classes = ch_type [code] & ~escape_char;

        if (classes == 0)
            ch_type [code] |= punct_char;
        code += 1;
    }
}
A wrapper around malloc() or realloc() calls to save a lot of testing later.
/*
 * Check the result of an allocation.  Returns p unchanged when it is
 * non-null; otherwise reports "Out of memory" on stderr and exits with
 * status 1.
 */
static void *ensure (void *p)
{
    if (p != NULL)
        return (p);

    fprintf (stderr, "Out of memory\n");
    exit (1);
}
A bit of string cleaning:
/*
 * Strip leading and trailing blanks from s.
 *
 * A blank is any character whose code is at or below the space
 * character.  Characters are compared as unsigned char so that bytes
 * with the top bit set are kept rather than being mistaken for blanks on
 * platforms where plain char is signed.
 *
 * Trailing blanks are overwritten with NUL in place.  The return value
 * points at the first non-blank character inside s, or is NULL when s is
 * NULL.
 */
static char *trim (char *s)
{
    size_t len;

    if (s == NULL)
        return (NULL);

    /* ltrim */
    while (*s != '\0' && (unsigned char) *s <= ' ')
        s += 1;

    /* rtrim */
    len = strlen (s);
    while (len > 0 && (unsigned char) s [len - 1] <= ' ')
        s [--len] = '\0';

    return (s);
}
Now for the lowest level input routines: get_line and push_back. The get_line routine simply calls fgets a few times until it either has a complete line or it exceeds some random limit (currently 10K). It reads into a malloced buffer, which it frees next time around. You can, of course, push_back a tail of this buffer which will look like the next line to be retrieved. This greatly simplifies a lot of the code later.
/* Text waiting to be handed out by the next get_line call, or NULL. */
static char *pushed_back;
/* Stream that get_line reads from. */
static FILE *in_file;
/* Name of the initialisation file, kept for error messages. */
static char *kw_name;

/*
 * Arrange for p to be returned by the next call to get_line.  Only one
 * piece of text may be pending at a time; the assertion enforces that.
 */
static void push_back (char *p)
{
    assert (pushed_back == NULL);
    pushed_back = p;
}
/*
 * Fetch the next line of input.
 *
 * If text has been pushed back it is returned (once) in preference to
 * reading.  Otherwise a line is read from in_file into a heap buffer
 * that is grown 1K at a time; the trailing newline, if any, is removed.
 * Reading stops extending the line once 10K or more has been gathered,
 * in which case the remainder is seen as the following line.
 *
 * The buffer belongs to this routine and is freed on the next call that
 * actually reads.  Returns NULL at end of input when nothing was read.
 */
static char *get_line (void)
{
    enum { chunk_size = 1024, max_line = 10 * 1024 };
    static char *line_buf;
    size_t used = 0;

    if (pushed_back)
    {
        char *tail = pushed_back;

        pushed_back = NULL;
        return (tail);
    }

    free (line_buf);
    line_buf = NULL;

    while (used < max_line)
    {
        size_t got;
        char *chunk;

        line_buf = ensure (realloc (line_buf, used + chunk_size));
        chunk = line_buf + used;
        if (fgets (chunk, chunk_size, in_file) == NULL)
            return (used ? line_buf : NULL);

        got = strlen (chunk);
        if (got == 0)
            break;
        if (chunk [got - 1] == '\n')
        {
            chunk [got - 1] = '\0';
            break;
        }
        /* No newline yet: keep what we have and read some more. */
        used += got;
    }
    return (line_buf);
}
The next_token routine assumes that t_end points to the first character worth considering for fetching a new token. This is usually where the previous token ended, hence the name, but can also be the first character of a new line.
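The steps are: (1) skip leading white space, echoing it to the output as we go; (2) if that brings us to the end of the line, report NUL; (3) if the token starts with a quote character, scan forward to the matching quote, letting an escape character protect the character that follows it; (4) otherwise gather up the run of following characters of the same class (identifier or punctuation) as the first.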
/*
 * t_start and t_end delimit the current token (t_end is one past its
 * last character); prefix holds the text that preceded the most recent
 * start-of-documentation marker.
 */
static char *t_start, *t_end, *prefix;

/*
 * Find the next token, starting the search at t_end.
 *
 * Leading white space is echoed through char_out.  On return t_start
 * and t_end bracket the token and the class of its first character is
 * returned; nul_char is returned at the end of the line.  A token that
 * starts with a quote character runs to the matching quote (or to the
 * end of the line), with an escape character protecting the character
 * after it.  Any other token is the first character plus the following
 * run of characters sharing its identifier/punctuation class.
 */
static int next_token (void)
{
    int kind;

    for (t_start = t_end; (kind = type_of (*t_start)) == space_char; t_start += 1)
        char_out (*t_start);
    if (kind == nul_char)
        return (nul_char);

    t_end = t_start;
    if (kind != quote_char)
    {
        const int same_class = kind & (id_char | punct_char);

        do
            t_end += 1;
        while (type_of (*t_end) & same_class);
        return (kind);
    }

    {
        const int closer = *t_start;

        for (t_end += 1; *t_end != '\0'; t_end += 1)
        {
            if (*t_end == closer)
            {
                t_end += 1;
                break;
            }
            /* Step over an escaped character, unless the line ends here. */
            if ((type_of (*t_end) & escape_char) && t_end [1] != '\0')
                t_end += 1;
        }
    }
    return (kind);
}
Before an identifier is output it's checked to see if it's a keyword, so that the type style can be changed if necessary. The keywords are read from an initialisation file and stored in a traditional linked list.
If the user requires that variable spelling be standardised, I also keep a list of variables that have been seen. Hopefully, this will not be a common occurrence, so I feel justified in "cheating" here.
/* One entry in a singly linked list of known spellings. */
typedef struct keyword keyword;
struct keyword
{
    char *spelling;     /* the characters of the word (len of them) */
    size_t len;         /* number of characters in spelling */
    keyword *next;      /* next entry, or NULL at the end of the list */
};

/* Head of the keyword list. */
static keyword *kw_base;
/* Head of the list of variables seen so far. */
static keyword *var_base;

/* Signature shared by strncmp and its case-insensitive stand-in. */
typedef int (*compare_fn) (const char *a, const char *b, size_t len);

/* Comparison used when searching the lists; case-sensitive to begin with. */
static compare_fn kwcomp = strncmp;
A non-ANSI string compare function that comes in handy
/*
 * Case-insensitive counterpart of strncmp: compare at most len
 * characters of a and b, ignoring the case of letters.  Returns zero
 * when they match and non-zero otherwise.
 *
 * MS-DOS compilers supply strnicmp; everywhere else the portable
 * version below is used.  Characters are converted to unsigned char
 * before being handed to tolower, as <ctype.h> requires.
 */
#if defined (__MSDOS__)
static compare_fn nocase_cmp = (strnicmp);
#else
static int nocase_cmp (const char *a, const char *b, size_t len)
{
    int res = 0;

    for (; res == 0 && len > 0 && (*a != '\0' || *b != '\0'); len -= 1)
    {
        res = tolower ((unsigned char) *a) - tolower ((unsigned char) *b);
        a += 1;
        b += 1;
    }
    return (res);
}
#endif
/*
 * Look for the current token (t_start up to t_end) in the list headed
 * by kw_base.  An entry matches when its length equals the token's and
 * kwcomp reports the characters equal.  Returns the stored spelling, or
 * NULL when the token is not in the list.
 */
static char *find_keyword (void)
{
    const size_t len = (size_t) (t_end - t_start);
    keyword *kw;

    for (kw = kw_base; kw != NULL; kw = kw -> next)
        if (kw -> len == len && kwcomp (kw -> spelling, t_start, len) == 0)
            return (kw -> spelling);
    return (NULL);
}
/*
 * Make sure the current token is in the list headed by kw_base, adding
 * a new entry at the front when it is not already there.  Returns the
 * stored spelling.  Note that a newly stored spelling holds exactly the
 * token's characters and is not NUL-terminated.
 */
static char *add_keyword (void)
{
    char *known = find_keyword ();
    size_t len;
    keyword *node;

    if (known != NULL)
        return (known);

    len = (size_t) (t_end - t_start);
    node = ensure (malloc (sizeof *node));
    node -> spelling = ensure (malloc (len));
    memcpy (node -> spelling, t_start, len);
    node -> len = len;
    node -> next = kw_base;
    kw_base = node;
    return (node -> spelling);
}
The keywords section of the initialisation file is simply a list of identifiers. These are read (by calling next_token) and stored away in the list, until we get either EOF or something starting '[' (which we'll take as meaning "end-of-section"). Comments start, as usual, with a semicolon. Unusually, this must be in the first column to allow for the odd cases where you really need a semicolon in your keywords.
/*
 * Read the keyword section of the initialisation file.  Lines starting
 * with ';' are skipped.  Every identifier token on the other lines is
 * added to the keyword list.  Reading stops at end of file or at the
 * first token that begins with '['.
 */
static void read_keywords (void)
{
    for (;;)
    {
        int kind;

        t_end = get_line ();
        if (t_end == NULL)
            return;
        if (*t_end == ';')
            continue;

        while ((kind = next_token ()) != nul_char)
        {
            if (*t_start == '[')
                return;
            if (kind == id_char)
                add_keyword ();
        }
    }
}
At various points in what follows we'll be changing the "typeface" between normal, bold (actually <strong>), and italic (<em>). In fact, whenever we want bold we'll call set_face(bold), no matter what was selected before. It's up to set_face() to remember what we've got, how to turn it off, and whether to suppress redundant switches.
/* The type faces that set_face can select. */
enum
{
    normal  = 0,
    italics = 1,
    bold    = 2
};

/* Tag text used for keywords, for comments and for the code block. */
static char *strong;
static char *em;
static char *pre;
/*
 * Switch the output type face.  The face currently in force is
 * remembered between calls (it starts out as normal); asking for the
 * same face again writes nothing.  Otherwise the old face's end tag is
 * written, followed by the new face's start tag.  The normal face has
 * no tags.
 */
static void set_face (int new_face)
{
    static int old_face;

    if (old_face == new_face)
        return;

    if (old_face == italics)
        etag_out (em);
    else if (old_face == bold)
        etag_out (strong);

    old_face = new_face;

    if (new_face == italics)
        stag_out (em);
    else if (new_face == bold)
        stag_out (strong);
}
Now come the routines that do the output, depending on what mode we happen to find ourselves in. At the start, we're not_decided, since we don't know if the first non-hidden line will be code or documentation.
/*
 * What kind of text we are in the middle of.  mode_now is the current
 * mode; prev_mode is the mode to go back to when hiding ends.
 */
static enum
{
    not_decided = 0,
    hiding      = 1,
    doing_code  = 2,
    doing_doc   = 3
} mode_now, prev_mode;

/* Non-zero while inside a block comment. */
static int comment_depth;
The two check_... routines are called to make sure the line we're working on isn't one of the magic marker lines. If it is magic, we do a mode-switch and drop the line.
/*
 * See whether the current line contains the start-of-hiding marker.  If
 * it does, remember the present mode, switch to hiding and return 1 so
 * the caller drops the line; otherwise return 0.
 */
static int check_hidden (void)
{
    /* The marker is spelled in pieces, presumably so that this program
       does not react to it when run over its own source. */
    if (strstr (t_start, "<" "hide" ">") == NULL)
        return (0);

    prev_mode = mode_now;
    mode_now = hiding;
    return (1);
}
/* Non-zero when code lines are to be left out of the output. */
static int hide_code;

/*
 * See whether the current line contains the start-of-documentation
 * marker.  If it does: save the text in front of the marker as the new
 * prefix, return to the normal face, close the code block if one is
 * open and being shown, switch to documentation mode and return 1 so
 * the caller drops the line.  Otherwise return 0.
 */
static int check_doc (void)
{
    char *marker = strstr (t_start, "<" "doc" ">");
    size_t pre_len;

    if (marker == NULL)
        return (0);

    pre_len = (size_t) (marker - t_start);
    free (prefix);
    prefix = ensure (malloc (pre_len + 1));
    memcpy (prefix, t_start, pre_len);
    prefix [pre_len] = '\0';

    set_face (normal);
    if (! hide_code && mode_now == doing_code)
        etag_out (pre);
    mode_now = doing_doc;
    return (1);
}
Comments are particularly nasty. Some languages allow them to nest, others don't. Some comments are delimited at both ends, some just extend to the end-of-line marker. And some languages have multiple types of comment marker. Looks like another job for linked lists.
/* One style of comment, kept in a singly linked list. */
typedef struct comment comment;
struct comment
{
    char *start;        /* text that opens the comment */
    char *end;          /* text that closes it; NULL means "to end of line" */
    int nestable;       /* non-zero if comments of this style may nest */
    comment *next;      /* next style, or NULL at the end of the list */
};

/* Head of the list of comment styles. */
static comment *comment_base;
/* Style of the block comment currently open. */
static comment *this_comment;
/*
 * Output part of a block comment, starting at t_start.
 *
 * The text is written up to and including the first closing marker on
 * the line.  For a nestable style, an opening marker that finishes
 * before the closing one takes priority: the text is written up to and
 * including that opener and the depth goes up instead of down.  The rest
 * of the line is pushed back for another pass.  When the line holds no
 * marker at all, the whole of it is written followed by a newline.
 */
static void do_comments (void)
{
    const char *closer = this_comment -> end;
    char *stop = strstr (t_start, closer);

    if (stop != NULL)
        stop += strlen (closer);

    if (this_comment -> nestable)
    {
        const char *opener = this_comment -> start;
        char *inner = strstr (t_start, opener);

        if (inner != NULL)
        {
            inner += strlen (opener);
            if (stop == NULL || inner < stop)
            {
                /* Net effect is +1 once the shared decrement below runs. */
                comment_depth += 2;
                stop = inner;
            }
        }
    }

    if (stop == NULL)
    {
        str_out (t_start);
        putchar ('\n');
        return;
    }

    while (t_start < stop)
        char_out (*t_start++);
    comment_depth -= 1;
    push_back (t_start);
}
Here's where we check for a start-of-comment. Check the sequence of characters we're just about to output, hoping that it matches one of the "open comment" patterns. If so, emit the line (in italics) if it's a "rest-of-line" style comment, or push back the comment text and start a "block comment". If it's not a comment, return 0 and it'll get output as code.
/*
 * Check whether the current token begins with one of the known comment
 * openers.  On a match the opener is written in italics.  For a
 * rest-of-line style the remainder of the line and a newline follow.
 * For a block style the remainder is pushed back and comment_depth is
 * set to 1.  Returns 1 if a comment was started and 0 if the token is
 * ordinary code.
 */
static int start_comment (void)
{
    comment *style;

    for (style = comment_base; style != NULL; style = style -> next)
    {
        const size_t len = strlen (style -> start);

        if ((size_t) (t_end - t_start) < len)
            continue;
        if (memcmp (t_start, style -> start, len) != 0)
            continue;

        set_face (italics);
        str_out (style -> start);
        t_start += len;
        if (style -> end == NULL)
        {
            str_out (t_start);
            putchar ('\n');
        }
        else
        {
            comment_depth = 1;
            this_comment = style;
            push_back (t_start);
        }
        return (1);
    }
    return (0);
}
The add_comment() routine puts things into the list, according to what gets found in the initialisation file.
/*
 * Add a comment style to the front of the list.  Private copies are
 * taken of start and, when it is not NULL, of end.  A NULL end is
 * stored as-is.  nest is stored as the style's nestable flag.
 */
static void add_comment (char *start, char *end, int nest)
{
    comment *node = ensure (malloc (sizeof *node));

    node -> start = strcpy (ensure (malloc (strlen (start) + 1)), start);
    node -> end = NULL;
    if (end != NULL)
        node -> end = strcpy (ensure (malloc (strlen (end) + 1)), end);
    node -> nestable = nest;
    node -> next = comment_base;
    comment_base = node;
}
Output the identifier in the preferred style of spelling: upper-case, lower-case, standardised, or as found in the source file.
/*
 * Write the current token (t_start up to t_end) in the given face,
 * spelled according to type:
 *   'u'  upper case;
 *   'l'  lower case;
 *   's'  the characters of std, which must supply at least as many
 *        characters as the token has;
 *   anything else: exactly as found in the source.
 * t_start is left equal to t_end.
 *
 * Characters are converted to unsigned char before being passed to
 * toupper/tolower, since <ctype.h> functions are undefined for negative
 * arguments other than EOF.
 */
static void correct_spelling (int face, char *std, int type)
{
    set_face (face);
    while (t_start < t_end)
    {
        switch (type)
        {
        case 'u': putchar (toupper ((unsigned char) *t_start)); break;
        case 'l': putchar (tolower ((unsigned char) *t_start)); break;
        case 's': putchar (*std++); break;
        default:  putchar (*t_start); break;
        }
        t_start += 1;
    }
}
static int kw_fold, var_fold;
static void print_id (void)
{
char *p = find_keyword ();
if (p)
correct_spelling (bold, p, kw_fold);
else
{
if (var_fold == 's')
{
If we've been asked to standardise variable spellings we've got to keep a list of what we've seen so far. This is going to make things run very slowly if we've got millions of variables defined.
keyword *keep_kw_base = kw_base;
kw_base = var_base;
p = add_keyword ();
var_base = kw_base;
kw_base = keep_kw_base;
}
else
p = t_start;
correct_spelling (normal, p, var_fold);
}
}
The processing for a line of code: emit the tokens one by one until told otherwise.
/*
 * Process one line of code.
 *
 * The line is dropped if it is a hide or documentation marker line, or
 * if code is not being shown.  Inside a block comment the line is
 * handed to do_comments.  Otherwise the tokens are written one by one:
 * identifiers via print_id, quoted strings in italics, and everything
 * else as punctuation in bold, one character at a time so that a
 * comment opener is noticed wherever it begins.  A newline ends the
 * line unless a comment took over the rest of it.
 */
static void do_code_line (void)
{
    int this_type;

    if (check_hidden () || check_doc () || hide_code)
        return;
    if (comment_depth)
    {
        do_comments ();
        return;
    }
    while ((this_type = next_token ()) != nul_char)
    {
        switch (this_type)
        {
        case id_char:
            print_id ();
            break;
        case quote_char:
            set_face (italics);
            while (t_start < t_end)
                char_out (*t_start++);
            break;
        case punct_char:
        default:
            /*
             * The default also catches characters carrying more than
             * one class bit.  set_punct leaves escape characters as
             * escape_char | punct_char, which matches no case label;
             * without this they would vanish from the output.
             */
            if (start_comment ())
                return;
            set_face (bold);
            char_out (*t_start++);
            t_end = t_start;
            break;
        }
    }
    char_out ('\n');
}
The processing for a line of documentation: if we're not ending the doc just print the line, remembering to remove anything it might have in common with the "start doc" line.
/*
 * Process one line of documentation.  Hide marker lines are dropped.
 * The end-of-documentation marker line switches back to code mode,
 * reopening the code block when code is being shown and restoring
 * italics when a block comment is still open.  Any other line is
 * written as-is, minus whatever leading characters it shares with the
 * saved prefix.
 */
static void do_doc_line (void)
{
    if (check_hidden ())
        return;

    if (strstr (t_start, "<" "/doc" ">") != NULL)
    {
        if (! hide_code)
            stag_out (pre);
        mode_now = doing_code;
        if (comment_depth)
            set_face (italics);
        return;
    }

    if (prefix != NULL)
    {
        const char *lead;

        for (lead = prefix; *lead != '\0' && *lead == *t_start; lead += 1)
            t_start += 1;
    }
    puts (t_start);
}
And, of course, hiding a line is simplest of all.
/*
 * Process one line while hiding: nothing is written, and the mode in
 * force before hiding began is restored when the line contains the
 * end-of-hiding marker.
 */
static void do_hiding_biz (void)
{
    const char *closer = strstr (t_start, "<" "/hide" ">");

    if (closer != NULL)
        mode_now = prev_mode;
}
This is the handler for "don't know" mode. If the line isn't the start of documentation, check to see if there's a token on it. If there is, push back the whole line and read it again as code.
/*
 * Handle a line while the mode is still undecided.  Marker lines are
 * dealt with by check_hidden and check_doc; a documentation marker also
 * turns output on.  Otherwise, if the line holds a token, output is
 * turned on, code mode is entered (opening the code block when code is
 * being shown) and the whole line is pushed back to be read again as
 * code.  A line holding no token is ignored.
 */
static void decide_then (void)
{
    char *line = t_start;

    if (check_hidden ())
        return;
    if (check_doc ())
    {
        quiet = 0;
        return;
    }
    if (next_token () == nul_char)
        return;

    quiet = 0;
    mode_now = doing_code;
    push_back (line);
    if (! hide_code)
        stag_out (pre);
}
Are two strings "the same", assuming blanks and case are not significant?
/*
 * Compare two strings, ignoring blanks (characters at or below a space)
 * and the case of letters.  Returns 1 if they are the same under those
 * rules and 0 otherwise.
 *
 * The strings are examined as unsigned char so that bytes with the top
 * bit set are neither mistaken for blanks nor passed to tolower as
 * negative values.
 */
static int matching_strings (char *a, char *b)
{
    const unsigned char *x = (const unsigned char *) a;
    const unsigned char *y = (const unsigned char *) b;

    for (;;)
    {
        while (*x != 0 && *x <= ' ')
            x += 1;
        while (*y != 0 && *y <= ' ')
            y += 1;
        if (tolower (*x) != tolower (*y))
            return (0);
        if (*x == 0)
            return (1);
        x += 1;
        y += 1;
    }
}
Is one string a prefix of the other, in the style of matching_strings() above?
/*
 * Test whether b is a prefix of a, ignoring blanks (characters at or
 * below a space) and the case of letters.  Returns 1 if every non-blank
 * character of b is matched, in order, by the start of a, and 0
 * otherwise.  An empty b is a prefix of anything.
 *
 * The strings are examined as unsigned char so that bytes with the top
 * bit set are neither mistaken for blanks nor passed to tolower as
 * negative values.
 */
static int is_prefix (char *a, char *b)
{
    const unsigned char *x = (const unsigned char *) a;
    const unsigned char *y = (const unsigned char *) b;

    for (;;)
    {
        while (*x != 0 && *x <= ' ')
            x += 1;
        while (*y != 0 && *y <= ' ')
            y += 1;
        if (*y == 0)
            return (1);
        if (tolower (*x) != tolower (*y))
            return (0);
        x += 1;
        y += 1;
    }
}
The initialisation file contains sections delimited by headings wrapped in [...] characters. For each language I'll be interested in the [<lang> Characters] and [<lang> Keywords] sections. This is where we do the looking.
/*
 * Position the initialisation file just after the heading line that
 * matches section_name (blanks and case ignored).  The search always
 * starts from the top of the file.  If there is no such heading, an
 * error is reported on stderr and the program exits with status 1.
 */
static void find_section (char *section_name)
{
    char *line;

    fseek (in_file, 0, SEEK_SET);
    for (line = get_line (); line != NULL; line = get_line ())
    {
        if (matching_strings (line, section_name))
            return;
    }

    fprintf (stderr, "Can't find %s section in %s\n", section_name, kw_name);
    exit (1);
}
Here's a horrible routine.
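Assuming we're positioned at the right section, read a collection of lines of the form param=val and, if the param is recognised, process the val in some way. The params understood so far are: quote (or quotes), the characters that start and end strings; letter (or letters), extra characters allowed in identifiers; escape (or escapes), the characters that "hide" a quote inside a string; comment (or comments), a comment opener, optionally followed by the matching closer and then by a further word that marks the comment as nestable; and keyword case and variable case, which say how keywords and variables are to be spelled on output (only the first letter of the value is examined: u, l or s, with anything else meaning "as found").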
Setting any value for variable case or keyword case indicates that the language is not case sensitive.
/*
 * Read param=val lines from the initialisation file up to end of file
 * or the next line starting with '['.  Lines without an '=' are
 * ignored, as are params that are not recognised.
 *
 *   keyword case / variable case
 *       first letter of val, lower-cased, becomes kw_fold / var_fold if
 *       it is 's', 'u' or 'l'; any other value is stored as '-'.
 *   comment / comments
 *       val is "opener", "opener closer" or "opener closer word"; the
 *       presence of the third word makes the comment nestable.
 *   quote(s), letter(s), escape(s)
 *       every non-blank character of val gains the matching class.
 */
static void read_setup (void)
{
    static struct kwvals
    {
        char *kw;
        int val;
    } params [] =
    {
        {"quote=", quote_char},
        {"quotes=", quote_char},
        {"letter=", id_char},
        {"letters=", id_char},
        {"escape=", escape_char},
        {"escapes=", escape_char},
        {NULL, 0}
    };
    char *line;

    while ((line = trim (get_line ())) != NULL)
    {
        char *equals;
        struct kwvals *p;

        if (*line == '[')
            return;
        equals = strchr (line, '=');
        if (equals == NULL)
            continue;
        equals = trim (equals + 1);

        if (is_prefix (line, "keyword case="))
        {
            switch (kw_fold = tolower ((unsigned char) *equals))
            {
            case 's': case 'u': case 'l': break;
            default: kw_fold = '-'; break;
            }
        }
        else if (is_prefix (line, "variable case="))
        {
            switch (var_fold = tolower ((unsigned char) *equals))
            {
            case 's': case 'u': case 'l': break;
            default: var_fold = '-'; break;
            }
        }
        else if (is_prefix (line, "comment=") || is_prefix (line, "comments="))
        {
            char *close_with = equals;

            while (*close_with > ' ')
                close_with += 1;
            if (*close_with == '\0')
                add_comment (equals, NULL, 0);
            else
            {
                char *end;

                *close_with++ = '\0';
                close_with = trim (close_with);
                end = close_with;
                while (*end > ' ')
                    end += 1;
                /*
                 * Only step past the closer when something follows it.
                 * Otherwise end stays on the terminating NUL, so the
                 * nest flag below is 0 rather than whatever byte lies
                 * beyond the end of the string.
                 */
                if (*end != '\0')
                    *end++ = '\0';
                add_comment (equals, close_with, *end);
            }
        }
        else
        {
            for (p = params; p -> kw; p += 1)
                if (is_prefix (line, p -> kw))
                {
                    init_from_string (equals, p -> val);
                    break;
                }
        }
    }
}
A marginally less horrible routine. Here we load the [General] section, which defines the HTML tags to use.
/*
 * Read param=val lines from the initialisation file up to end of file
 * or the next line starting with '['.  The params "keyword tag",
 * "comment tag" and "code tag" set strong, em and pre respectively to a
 * freshly allocated copy of val, freeing any earlier value.  Other
 * lines are ignored.
 */
static void read_general (void)
{
    static struct kwvals
    {
        char *kw;
        char **val;
    } params [] =
    {
        {"keyword tag=", &strong},
        {"comment tag=", &em},
        {"code tag=", &pre},
        {NULL, NULL}
    };
    char *line;

    while ((line = trim (get_line ())) != NULL)
    {
        char *value;
        struct kwvals *p;

        if (*line == '[')
            return;
        value = strchr (line, '=');
        if (value == NULL)
            continue;
        value = trim (value + 1);

        for (p = params; p -> kw != NULL; p += 1)
        {
            if (! is_prefix (line, p -> kw))
                continue;
            free (*(p -> val));
            *(p -> val) = strcpy (ensure (malloc (strlen (value) + 1)), value);
            break;
        }
    }
}
This is where we make the decision as to which language we're using and whether we're suppressing all but the documentation text.
/*
 * Handle the command line and load the initialisation file.
 *
 * An optional leading "-d" turns on hide_code.  Exactly one further
 * argument, the name of the initialisation file, must follow; if not, 0
 * is returned.  The [General], [Characters] and [Keywords] sections are
 * then read in that order, with default tags ("strong", "em", "pre")
 * filled in for any the file did not set.  Keyword comparison becomes
 * case-insensitive if either spelling rule was given.  Returns 1 on
 * success; a file that cannot be opened is reported and ends the
 * program with status 1.
 */
static int init_language (int argc, char **argv)
{
    const int docs_only = (argc > 1 && strcmp (argv [1], "-d") == 0);

    if (docs_only)
    {
        hide_code = 1;
        argv += 1;
        argc -= 1;
    }
    if (argc != 2)
        return (0);

    kw_name = argv [1];
    in_file = fopen (kw_name, "r");
    if (in_file == NULL)
    {
        fprintf (stderr, "Can't open '%s'\n", kw_name);
        exit (1);
    }

    find_section ("[General]");
    read_general ();
    if (strong == NULL)
        strong = "strong";
    if (em == NULL)
        em = "em";
    if (pre == NULL)
        pre = "pre";

    find_section ("[Characters]");
    read_setup ();
    if (kw_fold != 0 || var_fold != 0)
        kwcomp = nocase_cmp;

    find_section ("[Keywords]");
    read_keywords ();

    fclose (in_file);
    return (1);
}
Given the above, the main routine is quite simple. Do a bit of set-up, initialise according to language, then read stdin and process each line accordingly.
/*
 * Entry point.  Output starts off suppressed.  After the character
 * table and the language settings are loaded, every line of stdin is
 * handed to the routine for the current mode.  A usage message is
 * printed, and the program exits with status 1, if stdin is a terminal
 * or the arguments are wrong.  At the end, any open face and code block
 * are closed.  Returns 1 if the input never left the undecided mode and
 * 0 otherwise.
 */
int main (int argc, char **argv)
{
    quiet = 1;
    init_types ();
    if (isatty (fileno (stdin)) || ! init_language (argc, argv))
    {
        fprintf (stderr, "Usage: %s [-d] kw-file <source >dest.html\n", argv [0]);
        exit (1);
    }
    set_punct ();
    in_file = stdin;

    for (;;)
    {
        t_start = t_end = get_line ();
        if (t_start == NULL)
            break;

        if (mode_now == doing_code)
            do_code_line ();
        else if (mode_now == doing_doc)
            do_doc_line ();
        else if (mode_now == hiding)
            do_hiding_biz ();
        else if (mode_now == not_decided)
            decide_then ();
    }

    /* Input that ends while hiding is judged by the mode it interrupted. */
    if (mode_now == hiding)
        mode_now = prev_mode;

    if (mode_now == not_decided)
    {
        fprintf (stderr, "Nothing useful done\n");
        return (1);
    }
    if (mode_now == doing_code)
    {
        set_face (normal);
        if (! hide_code)
        {
            etag_out (pre);
            putchar ('\n');
        }
    }
    return (0);
}