HTML encoding is now smarter about encoding &, < and >.

If they are part of a valid HTML entity reference, an HTML tag, or
a Helma macro, they are passed through unchanged; otherwise
they are encoded to &amp;, &lt; or &gt;.

Another new feature that should make writing documentation on
Helma easier is that everything is encoded if it is placed within
a <code> tag.
This commit is contained in:
hns 2002-06-21 13:41:29 +00:00
parent 52a97b1a46
commit 962b2b6e6c

View file

@ -158,94 +158,248 @@ public final class HtmlEncoder {
}; };
static final HashSet allTags = new HashSet ();
static {
allTags.add ("a");
allTags.add ("abbr");
allTags.add ("address");
allTags.add ("applet");
allTags.add ("area");
allTags.add ("b");
allTags.add ("base");
allTags.add ("basefont");
allTags.add ("bgsound");
allTags.add ("big");
allTags.add ("blink");
allTags.add ("blockquote");
allTags.add ("bq");
allTags.add ("body");
allTags.add ("br");
allTags.add ("button");
allTags.add ("caption");
allTags.add ("center");
allTags.add ("cite");
allTags.add ("code");
allTags.add ("col");
allTags.add ("colgroup");
allTags.add ("del");
allTags.add ("dir");
allTags.add ("div");
allTags.add ("dl");
allTags.add ("dt");
allTags.add ("dd");
allTags.add ("em");
allTags.add ("embed");
allTags.add ("fieldset");
allTags.add ("font");
allTags.add ("form");
allTags.add ("frame");
allTags.add ("frameset");
allTags.add ("h1");
allTags.add ("h2");
allTags.add ("h3");
allTags.add ("h4");
allTags.add ("h5");
allTags.add ("h6");
allTags.add ("head");
allTags.add ("html");
allTags.add ("i");
allTags.add ("iframe");
allTags.add ("img");
allTags.add ("input");
allTags.add ("ins");
allTags.add ("isindex");
allTags.add ("kbd");
allTags.add ("li");
allTags.add ("link");
allTags.add ("listing");
allTags.add ("map");
allTags.add ("marquee");
allTags.add ("menu");
allTags.add ("meta");
allTags.add ("nobr");
allTags.add ("noframes");
allTags.add ("object");
allTags.add ("ol");
allTags.add ("option");
allTags.add ("optgroup");
allTags.add ("p");
allTags.add ("param");
allTags.add ("plaintext");
allTags.add ("pre");
allTags.add ("q");
allTags.add ("samp");
allTags.add ("script");
allTags.add ("select");
allTags.add ("small");
allTags.add ("span");
allTags.add ("strike");
allTags.add ("strong");
allTags.add ("style");
allTags.add ("sub");
allTags.add ("sup");
allTags.add ("table");
allTags.add ("tbody");
allTags.add ("td");
allTags.add ("textarea");
allTags.add ("tfoot");
allTags.add ("th");
allTags.add ("thead");
allTags.add ("title");
allTags.add ("tr");
allTags.add ("tt");
allTags.add ("u");
allTags.add ("ul");
allTags.add ("var");
allTags.add ("wbr");
allTags.add ("xmp");
allTags.add ("%");
}
// Structural container tags. Opening one of these makes encode() stop
// turning "\n" into "<br />"; closing it turns the conversion back on.
// This keeps stray <br />s from appearing in places where plain text is
// not expected, e.g. between a <table> and its first <tr>.
static final HashSet suppressLinebreakTags = new HashSet ();
static {
    String[] structuralTags = { "table", "ul", "ol", "pre" };
    for (int n = 0; n < structuralTags.length; n++) {
        suppressLinebreakTags.add (structuralTags[n]);
    }
}
// Content-bearing tags that live inside the structural containers.
// Opening one of these (e.g. <td>) makes encode() resume turning "\n"
// into "<br />", because ordinary text follows; closing it suppresses the
// conversion again until the next such tag is opened.
static final HashSet encodeLinebreakTags = new HashSet ();
static {
    String[] contentTags = { "td", "th", "li" };
    for (int n = 0; n < contentTags.length; n++) {
        encodeLinebreakTags.add (contentTags[n]);
    }
}
/** /**
* *
*/ */
public final static String encode (String str) { public final static String encode (String str) {
// try to make stringbuffer large enough from the start // try to make stringbuffer large enough from the start
StringBuffer ret = new StringBuffer (Math.round (str.length()*1.4f)); StringBuffer ret = new StringBuffer (Math.round (str.length()*1.4f));
encode (str, ret); encode (str, ret);
return ret.toString(); return ret.toString();
} }
/** /**
* *
*/ */
public final static void encode (String str, StringBuffer ret) { public final static void encode (String str, StringBuffer ret) {
if (str == null) if (str == null)
return; return;
int l = str.length(); char[] chars = str.toCharArray ();
int l = chars.length;
boolean closeTag=false, readTag=false, tagOpen=false;
// the difference between swallowOneNewline and ignoreNewline is that swallowOneNewline is just effective once (for the next newline) // are we currently within a < and a >?
boolean insideTag=false;
// if we are inside a <code> tag, we encode everything to make
// documentation work easier
boolean insideCodeTag = false;
// the difference between swallowOneNewline and ignoreNewline is that
// swallowOneNewline is just effective once (for the next newline)
boolean ignoreNewline = false; boolean ignoreNewline = false;
boolean swallowOneNewline = false; boolean swallowOneNewline = false;
StringBuffer tag = new StringBuffer ();
for (int i=0; i<l; i++) { for (int i=0; i<l; i++) {
char c = str.charAt (i); char c = chars[i];
if (readTag) {
if (Character.isLetterOrDigit (c))
tag.append (c);
else if ('/' == c)
closeTag = true;
else {
String t = tag.toString ();
// set ignoreNewline on some tags, depending on wheather they're
// being opened or closed.
// what's going on here? we switch newline encoding on inside some tags, for
// others we switch it on when they're closed
if ("td".equalsIgnoreCase (t) || "th".equalsIgnoreCase (t) || "li".equalsIgnoreCase (t)) {
ignoreNewline = closeTag;
swallowOneNewline = true;
} else if ("table".equalsIgnoreCase (t) || "ul".equalsIgnoreCase (t) || "ol".equalsIgnoreCase (t) || "pre".equalsIgnoreCase (t)) {
ignoreNewline = !closeTag;
swallowOneNewline = true;
} else if ("p".equalsIgnoreCase (t)) {
swallowOneNewline = true;
}
readTag = false;
closeTag = false;
tag.setLength (0);
}
} // if (readTag)
switch (c) { switch (c) {
// case '&': case '&':
// ret.append ("&amp;"); // check if this is an HTML entity already, in which case we pass it though unchanged
// break; if (i < l-4 && !insideCodeTag) {
case '\n': // is this a numeric entity?
ret.append ('\n'); if (chars[i+1] == '#' ) {
if (!ignoreNewline && !swallowOneNewline) int j = i+2;
ret.append ("<br />"); while (j<l && Character.isDigit (chars[j]))
if (!tagOpen) j++;
swallowOneNewline = false; if (j<l && chars[j] == ';') {
ret.append ("&");
break;
}
} else {
int j = i+1;
while (j<l && Character.isLetterOrDigit (chars[j]))
j++;
if (j<l && chars[j] == ';') {
ret.append ("&");
break;
}
}
}
// we didn't reach a break, so encode the ampersand as HTML entity
ret.append ("&amp;");
break; break;
case '<': case '<':
closeTag = false; if (i < l-2) {
readTag = true; boolean insideCloseTag = ('/' == chars[i+1]);
tagOpen = true; int tagStart = insideCloseTag ? i+2 : i+1;
ret.append ('<'); int j = tagStart;
while (j<l && (Character.isLetterOrDigit (chars[j]) || chars[j] == '%'))
j++;
if (j > tagStart && j < l) {
String tagName = new String (chars, tagStart, j-tagStart).toLowerCase ();
if ("code".equals (tagName) && insideCloseTag && insideCodeTag)
insideCodeTag = false;
if (allTags.contains (tagName) && !insideCodeTag) {
insideTag = true;
ret.append ('<');
// set ignoreNewline on some tags, depending on whether they're
// being opened or closed.
// what's going on here? we switch newline encoding on inside some tags, for
// others we switch it on when they're closed
if (encodeLinebreakTags.contains (tagName)) {
ignoreNewline = insideCloseTag;
swallowOneNewline = true;
} else if (suppressLinebreakTags.contains (tagName)) {
ignoreNewline = !insideCloseTag;
swallowOneNewline = true;
} else if ("p".equalsIgnoreCase (tagName) ||
"blockquote".equalsIgnoreCase (tagName) ||
"bq".equalsIgnoreCase (tagName)) {
swallowOneNewline = true;
}
if ("code".equals (tagName) && !insideCloseTag)
insideCodeTag = true;
break;
}
}
} // if (i < l-2)
ret.append ("&lt;");
break;
case '\n':
ret.append ('\n');
if (!insideTag && !ignoreNewline && !swallowOneNewline)
ret.append ("<br />");
if (!insideTag)
swallowOneNewline = false;
break; break;
case '>': case '>':
tagOpen = false; if (insideTag)
ret.append ('>'); ret.append ('>');
else
ret.append ("&gt;");
insideTag = false;
break; break;
default: default:
// ret.append (c); // ret.append (c);
if (c < 128) if (c < 128)
ret.append (c); ret.append (c);
else if (c >= 128 && c < 256) else if (c >= 128 && c < 256)
ret.append (transform[c-128]); ret.append (transform[c-128]);
else { else {
ret.append ("&#"); ret.append ("&#");
ret.append ((int) c); ret.append ((int) c);
ret.append (";"); ret.append (";");
} }
if (!tagOpen && !Character.isWhitespace (c)) if (!insideTag && !Character.isWhitespace (c))
swallowOneNewline = false; swallowOneNewline = false;
} }
} }
} }