Reworked encode() method: The hierarchy of HTML tags, macro tags and HTML

comments is now handled correctly. Also made the method a bit faster by not creating
an intermediate char array. Fixes bug 152.
This commit is contained in:
hns 2002-11-05 18:52:30 +00:00
parent 1daa648bea
commit 7f9c5d65c0

View file

@ -294,50 +294,61 @@ public final class HtmlEncoder {
} }
/** /**
* * Do "smart" encodging on a string. This means that valid HTML entities and tags,
* Helma macros and HTML comments are passed through unescaped, while
* other occurrences of '<', '>' and '&' are encoded to HTML entities.
*/ */
public final static void encode (String str, StringBuffer ret) { public final static void encode (String str, StringBuffer ret) {
if (str == null) if (str == null)
return; return;
char[] chars = str.toCharArray (); int l = str.length();
int l = chars.length;
// are we currently within a < and a >? // are we currently within a < and a > that consitute some kind of tag?
// we use tag balancing to know whether we are inside a tag (and should
// pass things through unchanged) or outside (and should encode stuff).
boolean insideTag = false; boolean insideTag = false;
// are we inside an HTML tag?
boolean insideHtmlTag = false;
// if we are inside a <code> tag, we encode everything to make // if we are inside a <code> tag, we encode everything to make
// documentation work easier // documentation work easier
boolean insideCodeTag = false; boolean insideCodeTag = false;
// are we within a macro tag? // are we within a Helma <% macro %> tag? We treat macro tags and
// comments specially, since we can't rely on tag balancing
// to know when we leave a macro tag or comment.
boolean insideMacroTag = false; boolean insideMacroTag = false;
// are we inside an HTML comment? // are we inside an HTML comment?
boolean insideComment = false; boolean insideComment = false;
// the quotation mark we are in within an HTML or Macro tag, if any
char htmlQuoteChar = '\u0000';
char macroQuoteChar = '\u0000';
// the difference between swallowOneNewline and ignoreNewline is that // the difference between swallowOneNewline and ignoreNewline is that
// swallowOneNewline is just effective once (for the next newline) // swallowOneNewline is just effective once (for the next newline)
boolean ignoreNewline = false; boolean ignoreNewline = false;
boolean swallowOneNewline = false; boolean swallowOneNewline = false;
for (int i=0; i<l; i++) { for (int i=0; i<l; i++) {
char c = chars[i]; char c = str.charAt(i);
switch (c) { switch (c) {
case '&': case '&':
// check if this is an HTML entity already, in which case we pass it though unchanged // check if this is an HTML entity already,
// in which case we pass it though unchanged
if (i < l-3 && !insideCodeTag) { if (i < l-3 && !insideCodeTag) {
// is this a numeric entity? // is this a numeric entity?
if (chars[i+1] == '#' ) { if (str.charAt(i+1) == '#' ) {
int j = i+2; int j = i+2;
while (j<l && Character.isDigit (chars[j])) while (j<l && Character.isDigit (str.charAt(j)))
j++; j++;
if (j<l && chars[j] == ';') { if (j<l && str.charAt(j) == ';') {
ret.append ("&"); ret.append ("&");
break; break;
} }
} else { } else {
int j = i+1; int j = i+1;
while (j<l && Character.isLetterOrDigit (chars[j])) while (j<l && Character.isLetterOrDigit (str.charAt(j)))
j++; j++;
if (j<l && chars[j] == ';') { if (j<l && str.charAt(j) == ';') {
ret.append ("&"); ret.append ("&");
break; break;
} }
@ -347,35 +358,32 @@ public final class HtmlEncoder {
ret.append ("&amp;"); ret.append ("&amp;");
break; break;
case '<': case '<':
if (insideTag) { if (i < l-2) {
ret.append ('<'); if (!insideMacroTag && '%' == str.charAt(i+1)) {
break; // this is the beginning of a Helma macro tag
} else if (i < l-2) { insideMacroTag = insideTag = true;
boolean insideCloseTag = ('/' == chars[i+1]); macroQuoteChar = '\u0000';
} else if ('!' == str.charAt(i+1) && '-' == str.charAt(i+2)) {
// the beginning of an HTML comment?
insideComment = insideTag = (i<l-3 && '-' == str.charAt(i+3));
} else if (!insideTag) {
// check if this is a HTML tag.
boolean insideCloseTag = ('/' == str.charAt(i+1));
int tagStart = insideCloseTag ? i+2 : i+1; int tagStart = insideCloseTag ? i+2 : i+1;
int j = tagStart; int j = tagStart;
while (j<l && Character.isLetterOrDigit (chars[j])) while (j<l && Character.isLetterOrDigit (str.charAt(j)))
j++; j++;
// if we haven't gotten past the <, // if we haven't gotten past the <,
// check if it's an HTML comment or Helma macro tag // check if it's an HTML comment or Helma macro tag
if (j == tagStart && !insideCodeTag) { // if (j == tagStart && !insideCodeTag) {
if ('%' == chars[j]) { // }
insideMacroTag = insideTag = true;
ret.append ('<');
continue;
} else if (j < l-2 && '!' == chars[j] && '-' == chars[j+1] && '-' == chars[j+2]) {
insideComment = insideTag = true;
ret.append ('<');
continue;
}
}
if (j > tagStart && j < l) { if (j > tagStart && j < l) {
String tagName = new String (chars, tagStart, j-tagStart).toLowerCase (); String tagName = str.substring (tagStart, j).toLowerCase();
if ("code".equals (tagName) && insideCloseTag && insideCodeTag) if ("code".equals (tagName) && insideCloseTag && insideCodeTag)
insideCodeTag = false; insideCodeTag = false;
if (allTags.contains (tagName) && !insideCodeTag) { if (allTags.contains (tagName) && !insideCodeTag) {
insideTag = true; insideHtmlTag = insideTag = true;
ret.append ('<'); htmlQuoteChar = '\u0000';
// set ignoreNewline on some tags, depending on wheather they're // set ignoreNewline on some tags, depending on wheather they're
// being opened or closed. // being opened or closed.
// what's going on here? we switch newline encoding on inside some tags, for // what's going on here? we switch newline encoding on inside some tags, for
@ -391,31 +399,60 @@ public final class HtmlEncoder {
} }
if ("code".equals (tagName) && !insideCloseTag) if ("code".equals (tagName) && !insideCloseTag)
insideCodeTag = true; insideCodeTag = true;
break; }
} }
} }
} // if (i < l-2) } // if (i < l-2)
if (insideTag)
ret.append ('<');
else
ret.append ("&lt;"); ret.append ("&lt;");
break; break;
case '"':
case '\'':
ret.append (c);
if (!insideComment) {
if (insideMacroTag) {
if (macroQuoteChar == c)
macroQuoteChar = '\u0000';
else if (macroQuoteChar == '\u0000')
macroQuoteChar = c;
} else if (insideHtmlTag) {
if (htmlQuoteChar == c)
htmlQuoteChar = '\u0000';
else if (htmlQuoteChar == '\u0000')
htmlQuoteChar = c;
}
}
break;
case '\n': case '\n':
ret.append ('\n'); ret.append ('\n');
if (!insideTag && !ignoreNewline && !swallowOneNewline) if (!insideTag) {
if (!ignoreNewline && !swallowOneNewline)
ret.append ("<br />"); ret.append ("<br />");
if (!insideTag)
swallowOneNewline = false; swallowOneNewline = false;
}
break; break;
case '>': case '>':
if (insideTag) { // For Helma macro tags and comments, we overrule tag balancing,
// i.e. we don't require that '<' and '>' be balanced within
// macros and comments. Rather, we check for the matching closing tag.
if (insideComment) {
ret.append ('>'); ret.append ('>');
if (insideMacroTag) insideComment = !(str.charAt(i-2) == '-' && str.charAt(i-1) == '-');
insideMacroTag = insideTag = !(chars[i-1] == '%'); } else if (insideMacroTag) {
else if (insideComment) ret.append ('>');
insideComment = insideTag = !(chars[i-2] == '-' && chars[i-1] == '-'); insideMacroTag = !(str.charAt(i-1) == '%' && macroQuoteChar == '\u0000');
else } else if (insideHtmlTag) {
insideTag = false; ret.append ('>');
// only leave HTML tag if quotation marks are balanced
// within that tag.
insideHtmlTag = htmlQuoteChar != '\u0000';
} else { } else {
ret.append ("&gt;"); ret.append ("&gt;");
} }
// check if we still are inside any kind of tag
insideTag = insideComment || insideMacroTag || insideHtmlTag;
break; break;
default: default:
// ret.append (c); // ret.append (c);
@ -428,7 +465,7 @@ public final class HtmlEncoder {
ret.append ((int) c); ret.append ((int) c);
ret.append (";"); ret.append (";");
} }
if (!insideTag && !Character.isWhitespace (c)) if (swallowOneNewline && !insideTag && !Character.isWhitespace (c))
swallowOneNewline = false; swallowOneNewline = false;
} }
} }