%{
/*
Copyright  1998 by John Lindal. All rights reserved.

This scanner reads an HTML file and calls the
following virtual functions:

	HandleHTMLWord
		Insert the given word.

	HandleHTMLWhitespace
		Insert whitespace as appropriate.

	HandleHTMLTag
		Handle the tag with the given name and attributes.

	HandleHTMLComment
		Handle the given comment.

	HandleHTMLError
		The specified error occurred.
*/

#define _H_JHTMLScannerL

#include <JHTMLScanner.h>
#include <JStringPtrMap.h>
#include <JString.h>
#include <stdlib.h>
#include <jAssert.h>

// This is the easiest way to keep track of our position in the text.
// flex executes YY_USER_ACTION just before the action of every matched rule
// (it is not executed for <<EOF>> rules).

#define YY_USER_ACTION	UpdatePosition();

// Hands one character to HandleChar() and makes yylex() return 0 when
// HandleChar() reports failure.  The do/while(0) wrapper makes the macro
// expand to exactly one statement, so it stays correct when used as the
// unbraced body of an if/else.

#define HANDLE_CHAR(c)	do { if (!HandleChar(c)) { return 0; } } while (0)
%}

/* Generate a C++ scanner whose yylex() is a member of JHTMLScanner. */
/* The prefix replaces the default "yy" in the generated class names. */
%option c++ yyclass="JHTMLScanner" prefix="JHTML"
/* 8bit:      accept all 256 character values */
/* nodefault: unmatched input is a scanner error instead of being echoed */
/* stack:     enable yy_push_state(), which the "&" rules use */
%option 8bit nodefault stack

/* Exclusive start conditions: while one is active, only the rules */
/* written inside its <STATE>{ ... } scope can match. */
%x TAG_STATE COMMENT_STATE TOSS_TAG_STATE
%x ATTR_NAME_STATE
%x INIT_ATTR_VALUE_STATE UNQUOTED_ATTR_VALUE_STATE QUOTED_ATTR_VALUE_STATE
%x CHAR_ESC_STATE

/* One or more whitespace characters (this includes newlines). */
WS	[[:space:]]+

%%

%{
	/* Code placed before the first rule is run on every entry to yylex(), */
	/* so each call starts scanning in the INITIAL start condition, even */
	/* if the previous call returned from inside a tag or comment. */
	BEGIN(INITIAL);
%}

"<" {
	/* start of a tag; the name and attributes are scanned in TAG_STATE */
	NewTag();
	BEGIN(TAG_STATE);
}

"<!--" {
	/* longer than the "<" rule, so it wins; remember where the comment starts */
	itsCommentRange.first = itsMatchRange.first;
	BEGIN(COMMENT_STATE);
}

"&" {
	/* possible character escape; pushed so the current state is remembered */
	yy_push_state(CHAR_ESC_STATE);
}

{WS} {
	/* a run of whitespace is reported as one unit; stop if the handler says so */
	if (!HandleHTMLWhitespace(yytext, itsMatchRange)) { return 0; }
}

[^<&[:space:]]+ {
	/* a run of ordinary text is reported as one word; stop if the handler says so */
	if (!HandleHTMLWord(yytext, itsMatchRange)) { return 0; }
}

	/* This catches and ignores the newline when <pre> is on a separate line. */
	/* Tag names are case insensitive, so each letter is matched in either */
	/* case; the name that is reported is always the lower-case "pre". */

"<"[pP][rR][eE]">"[ \t\r\f\v]*\n {
	/* the match includes the trailing newline, so it is never reported as whitespace */
	NewTag();
	(itsTagInfo->name).Set("pre");
	if (!TagFinished())
		{
		return 0;
		}
	}

\n[ \t\r\f\v]*"</"[pP][rR][eE]">" {
	/* Swallows the newline (and any indentation) in front of a closing pre tag. */
	/* Each letter is matched in either case because tag names are case */
	/* insensitive; the name that is reported is always the lower-case "/pre". */
	NewTag();
	(itsTagInfo->name).Set("/pre");
	if (!TagFinished())
		{
		return 0;
		}
	}


	/* Scans the tag name, then hands over to ATTR_NAME_STATE for the attributes */


<TAG_STATE>{

{WS}	{ /* whitespace in front of the tag name is skipped */ }

">" {
	/* the tag was closed before any name was seen; it is still reported */
	if (!TagFinished()) { return 0; }
	BEGIN(INITIAL);
}

[^>[:space:]]+ {
	/* the name is stored in lower case */
	itsTagInfo->name.Set(yytext, yyleng);
	itsTagInfo->name.ToLower();
	BEGIN(ATTR_NAME_STATE);
}

}


	/* Collects attribute names; an "=" after a name announces a value */


<ATTR_NAME_STATE>{

{WS}	{ /* whitespace between attributes is skipped */ }

">" {
	if (!TagFinished()) { return 0; }
	BEGIN(INITIAL);
}

=[[:space:]]* {
	/* an "=" only makes sense when an attribute name precedes it */
	if (!itsTagInfo->lastAttr.IsEmpty()) {
		BEGIN(INIT_ATTR_VALUE_STATE);
	} else {
		if (!HandleHTMLError(kEmptyAttrName, "*** empty attribute name ***", JIndexRange())) {
			return 0;
		}
		/* give up on this tag: skip everything up to the closing ">" */
		BEGIN(TOSS_TAG_STATE);
	}
}

[^>=[:space:]]+ {
	/* remember the lower-cased name and register it with a NULL value */
	itsTagInfo->lastAttr.Set(yytext, yyleng);
	itsTagInfo->lastAttr.ToLower();
	JString* noValue = NULL;
	itsTagInfo->attr.SetElement(itsTagInfo->lastAttr, noValue);
}

}


	/* This checks whether or not the attribute value is quoted */
	/* It is entered from ATTR_NAME_STATE after "=" and any following whitespace. */


<INIT_ATTR_VALUE_STATE>{

">" {
	/* the tag ended right after "=", so there is no value to read */
	if (!TagFinished())
		{
		return 0;
		}
	BEGIN(INITIAL);
	}

\" {
	/* the opening quote itself is not part of the value */
	BEGIN(QUOTED_ATTR_VALUE_STATE);
	}

[^">] {
	/* Any other character is the first character of an unquoted value. */
	/* Push it back so UNQUOTED_ATTR_VALUE_STATE scans the value from its start. */
	yyless(0);
	/* Rewind the recorded range as well, since the character will be matched */
	/* again.  NOTE(review): assumes UpdatePosition() continues from */
	/* itsMatchRange.last -- confirm in JHTMLScanner.cpp. */
	itsMatchRange.Set(itsMatchRange.first-1, itsMatchRange.first-1);
	BEGIN(UNQUOTED_ATTR_VALUE_STATE);
	}

}


	/* Accumulates an unquoted attribute value; whitespace or ">" ends it */


<UNQUOTED_ATTR_VALUE_STATE>{

{WS} {
	/* end of the value; more attributes may follow */
	SaveAttributeValue();
	BEGIN(ATTR_NAME_STATE);
}

">" {
	/* end of the value and of the tag */
	SaveAttributeValue();
	if (!TagFinished()) { return 0; }
	BEGIN(INITIAL);
}

"&" {
	/* possible character escape inside the value */
	yy_push_state(CHAR_ESC_STATE);
}

[^>&[:space:]]+ {
	itsTagInfo->valueBuffer.Append(yytext, yyleng);
}

}


	/* Accumulates a double-quoted attribute value up to the closing quote */


<QUOTED_ATTR_VALUE_STATE>{

\"  {
	/* closing quote: the value is complete; more attributes may follow */
	SaveAttributeValue();
	BEGIN(ATTR_NAME_STATE);
}

"&" {
	/* possible character escape inside the value */
	yy_push_state(CHAR_ESC_STATE);
}

[^"&]+ {
	/* everything else, including whitespace and ">", belongs to the value */
	itsTagInfo->valueBuffer.Append(yytext, yyleng);
}

}


	/* Skips the body of a comment and reports the range of the whole comment */


<COMMENT_STATE>{

"-->" {
	/* end of the comment: close the range and report it */
	itsCommentRange.last = itsMatchRange.last;
	if (!HandleHTMLComment(itsCommentRange)) { return 0; }
	BEGIN(INITIAL);
}

.|\n	{ /* the text of the comment is skipped one character at a time */ }

<<EOF>> {
	/* the comment was never closed: report what there is, then stop; */
	/* the handler's result is not needed because scanning ends here anyway */
	itsCommentRange.last = itsMatchRange.last;
	HandleHTMLComment(itsCommentRange);
	yyterminate();
}

}


	/* Discards the rest of a tag after an error has been reported */


<TOSS_TAG_STATE>{

">" {
	/* the tag is finished here so JHTMLStyler gets the correct range */
	if (!TagFinished()) { return 0; }
	BEGIN(INITIAL);
}

[^>]*	{ /* everything up to the closing ">" is thrown away */ }

}


	/* This converts legal character escapes (&) */
	/* Unlike the rest of HTML, these are case sensitive */


<CHAR_ESC_STATE>{

.|\n {
	/* Fallback: the "&" did not start a known escape.  Every other rule in */
	/* this state needs at least two characters and flex prefers the longest */
	/* match, so this rule only wins when none of them applies. */
	yyless(0);			/* if nothing else matches, don't change the stream */
	/* Rewind the recorded range too, because the character will be matched again. */
	/* NOTE(review): assumes UpdatePosition() continues from itsMatchRange.last -- confirm. */
	itsMatchRange.Set(itsMatchRange.first-1, itsMatchRange.first-1);
	/* The "&" is delivered as an ordinary character. */
	/* NOTE(review): nothing in this file pops the state pushed by the "&" rules; */
	/* presumably HandleChar() calls yy_pop_state() -- confirm. */
	HANDLE_CHAR('&');
	}

<<EOF>> {
	/* The input ended right after the "&", so deliver it as an ordinary character. */
	/* itsMatchRange is correct because it isn't changed by <<EOF>> */
	HANDLE_CHAR('&');
	yyterminate();
	}

#[0-9]+;? {
	/* Numeric character reference, e.g. "#169;" (the "&" was already consumed). */
	/* All digits are consumed, so a reference with leading zeros or more than */
	/* three digits is no longer split into a bogus character plus stray text. */
	/* strtol() saturates on overflow instead of invoking undefined behaviour. */
	const long value = strtol(yytext+1, NULL, 10);
	if (1 <= value && value <= 255)
		{
		HANDLE_CHAR((int) value);
		}
	else
		{
		/* Zero, or outside the 8-bit range that every other escape in this */
		/* state maps to: not a legal escape here, so leave the stream */
		/* unchanged, exactly as the fallback rule does. */
		yyless(0);
		itsMatchRange.Set(itsMatchRange.first-1, itsMatchRange.first-1);
		HANDLE_CHAR('&');
		}
	}

lt;?      HANDLE_CHAR('<');		/* less than (the trailing ';' is optional for every name) */
gt;?      HANDLE_CHAR('>');		/* greater than */

amp;?     HANDLE_CHAR('&');		/* ampersand */
yen;?     HANDLE_CHAR('\xA5');	/* currency: yen */
uml;?     HANDLE_CHAR('\xA8');	/* umlaut (diaeresis) */
not;?     HANDLE_CHAR('\xAC');	/* logical not */
shy;?     HANDLE_CHAR('\xAD');	/* soft hyphen */
reg;?     HANDLE_CHAR('\xAE');	/* registered trademark */
deg;?     HANDLE_CHAR('\xB0');	/* degrees */
ETH;?     HANDLE_CHAR('\xD0');	/* Icelandic eth (capital) */
eth;?     HANDLE_CHAR('\xF0');	/* Icelandic eth (lowercase) */

quot;?    HANDLE_CHAR('"');		/* double quote */
nbsp;?    HANDLE_CHAR(' ');		/* non-breaking space, delivered as an ordinary space */
cent;?    HANDLE_CHAR('\xA2');	/* cents */
sect;?    HANDLE_CHAR('\xA7');	/* section sign */
copy;?    HANDLE_CHAR('\xA9');	/* copyright */
ordf;?    HANDLE_CHAR('\xAA');	/* feminine ordinal indicator (superscript a) */
macr;?    HANDLE_CHAR('\xAF');	/* macron (overline accent) */
sup1;?    HANDLE_CHAR('\xB9');	/* superscript 1 */
sup2;?    HANDLE_CHAR('\xB2');	/* superscript 2 */
sup3;?    HANDLE_CHAR('\xB3');	/* superscript 3 */
para;?    HANDLE_CHAR('\xB6');	/* paragraph */
ordm;?    HANDLE_CHAR('\xBA');	/* masculine ordinal indicator (superscript o) */
Auml;?    HANDLE_CHAR('\xC4');	/* upper case umlauts */
Euml;?    HANDLE_CHAR('\xCB');
Iuml;?    HANDLE_CHAR('\xCF');
Ouml;?    HANDLE_CHAR('\xD6');
Uuml;?    HANDLE_CHAR('\xDC');
auml;?    HANDLE_CHAR('\xE4');	/* lower case umlauts */
euml;?    HANDLE_CHAR('\xEB');
iuml;?    HANDLE_CHAR('\xEF');
ouml;?    HANDLE_CHAR('\xF6');
uuml;?    HANDLE_CHAR('\xFC');
yuml;?    HANDLE_CHAR('\xFF');	/* they really did leave out Yuml */

iexcl;?   HANDLE_CHAR('\xA1');	/* inverted exclamation mark */
pound;?   HANDLE_CHAR('\xA3');	/* currency: pound */
laquo;?   HANDLE_CHAR('\xAB');	/* << */
acute;?   HANDLE_CHAR('\xB4');	/* accent mark */
micro;?   HANDLE_CHAR('\xB5');	/* greek mu */
cedil;?   HANDLE_CHAR('\xB8');	/* accent mark */
raquo;?   HANDLE_CHAR('\xBB');	/* >> */
times;?   HANDLE_CHAR('\xD7');	/* multiplication sign */
Acirc;?   HANDLE_CHAR('\xC2');	/* upper case modified characters */
Aring;?   HANDLE_CHAR('\xC5');
AElig;?   HANDLE_CHAR('\xC6');
Ecirc;?   HANDLE_CHAR('\xCA');
Icirc;?   HANDLE_CHAR('\xCE');
Ocirc;?   HANDLE_CHAR('\xD4');
Ucirc;?   HANDLE_CHAR('\xDB');
THORN;?   HANDLE_CHAR('\xDE');
szlig;?   HANDLE_CHAR('\xDF');	/* German double s */
acirc;?   HANDLE_CHAR('\xE2');	/* lower case modified characters */
aring;?   HANDLE_CHAR('\xE5');
aelig;?   HANDLE_CHAR('\xE6');
ecirc;?   HANDLE_CHAR('\xEA');
icirc;?   HANDLE_CHAR('\xEE');
ocirc;?   HANDLE_CHAR('\xF4');
ucirc;?   HANDLE_CHAR('\xFB');
thorn;?   HANDLE_CHAR('\xFE');

curren;?  HANDLE_CHAR('\xA4');	/* generic currency sign */
brvbar;?  HANDLE_CHAR('\xA6');	/* broken vertical bar */
plusmn;?  HANDLE_CHAR('\xB1');	/* plus minus */
middot;?  HANDLE_CHAR('\xB7');	/* dot */
frac14;?  HANDLE_CHAR('\xBC');	/* 1/4 */
frac12;?  HANDLE_CHAR('\xBD');	/* 1/2 */
frac34;?  HANDLE_CHAR('\xBE');	/* 3/4 */
iquest;?  HANDLE_CHAR('\xBF');	/* upside down question mark */
Agrave;?  HANDLE_CHAR('\xC0');	/* upper case accented characters */
Aacute;?  HANDLE_CHAR('\xC1');
Atilde;?  HANDLE_CHAR('\xC3');
Ccedil;?  HANDLE_CHAR('\xC7');
Egrave;?  HANDLE_CHAR('\xC8');
Eacute;?  HANDLE_CHAR('\xC9');
Igrave;?  HANDLE_CHAR('\xCC');
Iacute;?  HANDLE_CHAR('\xCD');
Ntilde;?  HANDLE_CHAR('\xD1');
Ograve;?  HANDLE_CHAR('\xD2');
Oacute;?  HANDLE_CHAR('\xD3');
Otilde;?  HANDLE_CHAR('\xD5');
Oslash;?  HANDLE_CHAR('\xD8');
Ugrave;?  HANDLE_CHAR('\xD9');
Uacute;?  HANDLE_CHAR('\xDA');
Yacute;?  HANDLE_CHAR('\xDD');
agrave;?  HANDLE_CHAR('\xE0');	/* lower case accented characters */
aacute;?  HANDLE_CHAR('\xE1');
atilde;?  HANDLE_CHAR('\xE3');
ccedil;?  HANDLE_CHAR('\xE7');
egrave;?  HANDLE_CHAR('\xE8');
eacute;?  HANDLE_CHAR('\xE9');
igrave;?  HANDLE_CHAR('\xEC');
iacute;?  HANDLE_CHAR('\xED');
ntilde;?  HANDLE_CHAR('\xF1');
ograve;?  HANDLE_CHAR('\xF2');
oacute;?  HANDLE_CHAR('\xF3');
otilde;?  HANDLE_CHAR('\xF5');
divide;?  HANDLE_CHAR('\xF7');	/* division sign */
oslash;?  HANDLE_CHAR('\xF8');
ugrave;?  HANDLE_CHAR('\xF9');
uacute;?  HANDLE_CHAR('\xFA');
yacute;?  HANDLE_CHAR('\xFD');

}

%%
