Reworked encode() method: The hierarchy of HTML tags, macro tags and HTML
comments is now handled correctly. Also made the method perform a bit faster by not creating an intermediate char array. Fixes bug 152.
This commit is contained in:
parent
1daa648bea
commit
7f9c5d65c0
1 changed files with 106 additions and 69 deletions
|
@ -294,50 +294,61 @@ public final class HtmlEncoder {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* Do "smart" encoding on a string. This means that valid HTML entities and tags,
|
||||||
|
* Helma macros and HTML comments are passed through unescaped, while
|
||||||
|
* other occurrences of '<', '>' and '&' are encoded to HTML entities.
|
||||||
*/
|
*/
|
||||||
public final static void encode (String str, StringBuffer ret) {
|
public final static void encode (String str, StringBuffer ret) {
|
||||||
if (str == null)
|
if (str == null)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
char[] chars = str.toCharArray ();
|
int l = str.length();
|
||||||
int l = chars.length;
|
|
||||||
|
|
||||||
// are we currently within a < and a >?
|
// are we currently within a < and a > that constitute some kind of tag?
|
||||||
boolean insideTag=false;
|
// we use tag balancing to know whether we are inside a tag (and should
|
||||||
|
// pass things through unchanged) or outside (and should encode stuff).
|
||||||
|
boolean insideTag = false;
|
||||||
|
// are we inside an HTML tag?
|
||||||
|
boolean insideHtmlTag = false;
|
||||||
// if we are inside a <code> tag, we encode everything to make
|
// if we are inside a <code> tag, we encode everything to make
|
||||||
// documentation work easier
|
// documentation work easier
|
||||||
boolean insideCodeTag = false;
|
boolean insideCodeTag = false;
|
||||||
// are we within a macro tag?
|
// are we within a Helma <% macro %> tag? We treat macro tags and
|
||||||
|
// comments specially, since we can't rely on tag balancing
|
||||||
|
// to know when we leave a macro tag or comment.
|
||||||
boolean insideMacroTag = false;
|
boolean insideMacroTag = false;
|
||||||
// are we inside an HTML comment?
|
// are we inside an HTML comment?
|
||||||
boolean insideComment = false;
|
boolean insideComment = false;
|
||||||
|
// the quotation mark we are in within an HTML or Macro tag, if any
|
||||||
|
char htmlQuoteChar = '\u0000';
|
||||||
|
char macroQuoteChar = '\u0000';
|
||||||
// the difference between swallowOneNewline and ignoreNewline is that
|
// the difference between swallowOneNewline and ignoreNewline is that
|
||||||
// swallowOneNewline is just effective once (for the next newline)
|
// swallowOneNewline is just effective once (for the next newline)
|
||||||
boolean ignoreNewline = false;
|
boolean ignoreNewline = false;
|
||||||
boolean swallowOneNewline = false;
|
boolean swallowOneNewline = false;
|
||||||
|
|
||||||
for (int i=0; i<l; i++) {
|
for (int i=0; i<l; i++) {
|
||||||
char c = chars[i];
|
char c = str.charAt(i);
|
||||||
|
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case '&':
|
case '&':
|
||||||
// check if this is an HTML entity already, in which case we pass it though unchanged
|
// check if this is an HTML entity already,
|
||||||
|
// in which case we pass it though unchanged
|
||||||
if (i < l-3 && !insideCodeTag) {
|
if (i < l-3 && !insideCodeTag) {
|
||||||
// is this a numeric entity?
|
// is this a numeric entity?
|
||||||
if (chars[i+1] == '#' ) {
|
if (str.charAt(i+1) == '#' ) {
|
||||||
int j = i+2;
|
int j = i+2;
|
||||||
while (j<l && Character.isDigit (chars[j]))
|
while (j<l && Character.isDigit (str.charAt(j)))
|
||||||
j++;
|
j++;
|
||||||
if (j<l && chars[j] == ';') {
|
if (j<l && str.charAt(j) == ';') {
|
||||||
ret.append ("&");
|
ret.append ("&");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
int j = i+1;
|
int j = i+1;
|
||||||
while (j<l && Character.isLetterOrDigit (chars[j]))
|
while (j<l && Character.isLetterOrDigit (str.charAt(j)))
|
||||||
j++;
|
j++;
|
||||||
if (j<l && chars[j] == ';') {
|
if (j<l && str.charAt(j) == ';') {
|
||||||
ret.append ("&");
|
ret.append ("&");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -347,35 +358,32 @@ public final class HtmlEncoder {
|
||||||
ret.append ("&");
|
ret.append ("&");
|
||||||
break;
|
break;
|
||||||
case '<':
|
case '<':
|
||||||
if (insideTag) {
|
if (i < l-2) {
|
||||||
ret.append ('<');
|
if (!insideMacroTag && '%' == str.charAt(i+1)) {
|
||||||
break;
|
// this is the beginning of a Helma macro tag
|
||||||
} else if (i < l-2) {
|
insideMacroTag = insideTag = true;
|
||||||
boolean insideCloseTag = ('/' == chars[i+1]);
|
macroQuoteChar = '\u0000';
|
||||||
|
} else if ('!' == str.charAt(i+1) && '-' == str.charAt(i+2)) {
|
||||||
|
// the beginning of an HTML comment?
|
||||||
|
insideComment = insideTag = (i<l-3 && '-' == str.charAt(i+3));
|
||||||
|
} else if (!insideTag) {
|
||||||
|
// check if this is a HTML tag.
|
||||||
|
boolean insideCloseTag = ('/' == str.charAt(i+1));
|
||||||
int tagStart = insideCloseTag ? i+2 : i+1;
|
int tagStart = insideCloseTag ? i+2 : i+1;
|
||||||
int j = tagStart;
|
int j = tagStart;
|
||||||
while (j<l && Character.isLetterOrDigit (chars[j]))
|
while (j<l && Character.isLetterOrDigit (str.charAt(j)))
|
||||||
j++;
|
j++;
|
||||||
// if we haven't gotten past the <,
|
// if we haven't gotten past the <,
|
||||||
// check if it's an HTML comment or Helma macro tag
|
// check if it's an HTML comment or Helma macro tag
|
||||||
if (j == tagStart && !insideCodeTag) {
|
// if (j == tagStart && !insideCodeTag) {
|
||||||
if ('%' == chars[j]) {
|
// }
|
||||||
insideMacroTag = insideTag = true;
|
|
||||||
ret.append ('<');
|
|
||||||
continue;
|
|
||||||
} else if (j < l-2 && '!' == chars[j] && '-' == chars[j+1] && '-' == chars[j+2]) {
|
|
||||||
insideComment = insideTag = true;
|
|
||||||
ret.append ('<');
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (j > tagStart && j < l) {
|
if (j > tagStart && j < l) {
|
||||||
String tagName = new String (chars, tagStart, j-tagStart).toLowerCase ();
|
String tagName = str.substring (tagStart, j).toLowerCase();
|
||||||
if ("code".equals (tagName) && insideCloseTag && insideCodeTag)
|
if ("code".equals (tagName) && insideCloseTag && insideCodeTag)
|
||||||
insideCodeTag = false;
|
insideCodeTag = false;
|
||||||
if (allTags.contains (tagName) && !insideCodeTag) {
|
if (allTags.contains (tagName) && !insideCodeTag) {
|
||||||
insideTag = true;
|
insideHtmlTag = insideTag = true;
|
||||||
ret.append ('<');
|
htmlQuoteChar = '\u0000';
|
||||||
// set ignoreNewline on some tags, depending on wheather they're
|
// set ignoreNewline on some tags, depending on wheather they're
|
||||||
// being opened or closed.
|
// being opened or closed.
|
||||||
// what's going on here? we switch newline encoding on inside some tags, for
|
// what's going on here? we switch newline encoding on inside some tags, for
|
||||||
|
@ -391,31 +399,60 @@ public final class HtmlEncoder {
|
||||||
}
|
}
|
||||||
if ("code".equals (tagName) && !insideCloseTag)
|
if ("code".equals (tagName) && !insideCloseTag)
|
||||||
insideCodeTag = true;
|
insideCodeTag = true;
|
||||||
break;
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} // if (i < l-2)
|
} // if (i < l-2)
|
||||||
|
if (insideTag)
|
||||||
|
ret.append ('<');
|
||||||
|
else
|
||||||
ret.append ("<");
|
ret.append ("<");
|
||||||
break;
|
break;
|
||||||
|
case '"':
|
||||||
|
case '\'':
|
||||||
|
ret.append (c);
|
||||||
|
if (!insideComment) {
|
||||||
|
if (insideMacroTag) {
|
||||||
|
if (macroQuoteChar == c)
|
||||||
|
macroQuoteChar = '\u0000';
|
||||||
|
else if (macroQuoteChar == '\u0000')
|
||||||
|
macroQuoteChar = c;
|
||||||
|
} else if (insideHtmlTag) {
|
||||||
|
if (htmlQuoteChar == c)
|
||||||
|
htmlQuoteChar = '\u0000';
|
||||||
|
else if (htmlQuoteChar == '\u0000')
|
||||||
|
htmlQuoteChar = c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
case '\n':
|
case '\n':
|
||||||
ret.append ('\n');
|
ret.append ('\n');
|
||||||
if (!insideTag && !ignoreNewline && !swallowOneNewline)
|
if (!insideTag) {
|
||||||
|
if (!ignoreNewline && !swallowOneNewline)
|
||||||
ret.append ("<br />");
|
ret.append ("<br />");
|
||||||
if (!insideTag)
|
|
||||||
swallowOneNewline = false;
|
swallowOneNewline = false;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case '>':
|
case '>':
|
||||||
if (insideTag) {
|
// For Helma macro tags and comments, we overrule tag balancing,
|
||||||
|
// i.e. we don't require that '<' and '>' be balanced within
|
||||||
|
// macros and comments. Rather, we check for the matching closing tag.
|
||||||
|
if (insideComment) {
|
||||||
ret.append ('>');
|
ret.append ('>');
|
||||||
if (insideMacroTag)
|
insideComment = !(str.charAt(i-2) == '-' && str.charAt(i-1) == '-');
|
||||||
insideMacroTag = insideTag = !(chars[i-1] == '%');
|
} else if (insideMacroTag) {
|
||||||
else if (insideComment)
|
ret.append ('>');
|
||||||
insideComment = insideTag = !(chars[i-2] == '-' && chars[i-1] == '-');
|
insideMacroTag = !(str.charAt(i-1) == '%' && macroQuoteChar == '\u0000');
|
||||||
else
|
} else if (insideHtmlTag) {
|
||||||
insideTag = false;
|
ret.append ('>');
|
||||||
|
// only leave HTML tag if quotation marks are balanced
|
||||||
|
// within that tag.
|
||||||
|
insideHtmlTag = htmlQuoteChar != '\u0000';
|
||||||
} else {
|
} else {
|
||||||
ret.append (">");
|
ret.append (">");
|
||||||
}
|
}
|
||||||
|
// check if we still are inside any kind of tag
|
||||||
|
insideTag = insideComment || insideMacroTag || insideHtmlTag;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
// ret.append (c);
|
// ret.append (c);
|
||||||
|
@ -428,7 +465,7 @@ public final class HtmlEncoder {
|
||||||
ret.append ((int) c);
|
ret.append ((int) c);
|
||||||
ret.append (";");
|
ret.append (";");
|
||||||
}
|
}
|
||||||
if (!insideTag && !Character.isWhitespace (c))
|
if (swallowOneNewline && !insideTag && !Character.isWhitespace (c))
|
||||||
swallowOneNewline = false;
|
swallowOneNewline = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue