From 962b2b6e6cb860851a0810af9b41c82d351af813 Mon Sep 17 00:00:00 2001 From: hns Date: Fri, 21 Jun 2002 13:41:29 +0000 Subject: [PATCH] HTML encoding is now smarter about encoding &, < and >. If they are part of a valid HTML entity reference, an HTML tag or a Helma macro they are passed through unchanged, otherwise they are encoded to &amp;, &lt; or &gt;. Another new feature that should make writing documentation on Helma easier is that everything is encoded if it is placed within a <code> tag. --- src/helma/util/HtmlEncoder.java | 288 ++++++++++++++++++++++++-------- 1 file changed, 221 insertions(+), 67 deletions(-) diff --git a/src/helma/util/HtmlEncoder.java b/src/helma/util/HtmlEncoder.java index d9b23fa9..bc99301c 100644 --- a/src/helma/util/HtmlEncoder.java +++ b/src/helma/util/HtmlEncoder.java @@ -158,94 +158,248 @@ public final class HtmlEncoder { }; + static final HashSet allTags = new HashSet (); + static { + allTags.add ("a"); + allTags.add ("abbr"); + allTags.add ("address"); + allTags.add ("applet"); + allTags.add ("area"); + allTags.add ("b"); + allTags.add ("base"); + allTags.add ("basefont"); + allTags.add ("bgsound"); + allTags.add ("big"); + allTags.add ("blink"); + allTags.add ("blockquote"); + allTags.add ("bq"); + allTags.add ("body"); + allTags.add ("br"); + allTags.add ("button"); + allTags.add ("caption"); + allTags.add ("center"); + allTags.add ("cite"); + allTags.add ("code"); + allTags.add ("col"); + allTags.add ("colgroup"); + allTags.add ("del"); + allTags.add ("dir"); + allTags.add ("div"); + allTags.add ("dl"); + allTags.add ("dt"); + allTags.add ("dd"); + allTags.add ("em"); + allTags.add ("embed"); + allTags.add ("fieldset"); + allTags.add ("font"); + allTags.add ("form"); + allTags.add ("frame"); + allTags.add ("frameset"); + allTags.add ("h1"); + allTags.add ("h2"); + allTags.add ("h3"); + allTags.add ("h4"); + allTags.add ("h5"); + allTags.add ("h6"); + allTags.add ("head"); + allTags.add ("html"); + allTags.add ("i"); + allTags.add ("iframe"); 
+ allTags.add ("img"); + allTags.add ("input"); + allTags.add ("ins"); + allTags.add ("isindex"); + allTags.add ("kbd"); + allTags.add ("li"); + allTags.add ("link"); + allTags.add ("listing"); + allTags.add ("map"); + allTags.add ("marquee"); + allTags.add ("menu"); + allTags.add ("meta"); + allTags.add ("nobr"); + allTags.add ("noframes"); + allTags.add ("object"); + allTags.add ("ol"); + allTags.add ("option"); + allTags.add ("optgroup"); + allTags.add ("p"); + allTags.add ("param"); + allTags.add ("plaintext"); + allTags.add ("pre"); + allTags.add ("q"); + allTags.add ("samp"); + allTags.add ("script"); + allTags.add ("select"); + allTags.add ("small"); + allTags.add ("span"); + allTags.add ("strike"); + allTags.add ("strong"); + allTags.add ("style"); + allTags.add ("sub"); + allTags.add ("sup"); + allTags.add ("table"); + allTags.add ("tbody"); + allTags.add ("td"); + allTags.add ("textarea"); + allTags.add ("tfoot"); + allTags.add ("th"); + allTags.add ("thead"); + allTags.add ("title"); + allTags.add ("tr"); + allTags.add ("tt"); + allTags.add ("u"); + allTags.add ("ul"); + allTags.add ("var"); + allTags.add ("wbr"); + allTags.add ("xmp"); + allTags.add ("%"); + } + + // tags which signal us to start suppressing \n ->
<br> encoding + // these are "structural" tags, for example, we don't want to add <br>
s + // between a <table> and a <tr>. + static final HashSet suppressLinebreakTags = new HashSet (); + static { + suppressLinebreakTags.add ("table"); + suppressLinebreakTags.add ("ul"); + suppressLinebreakTags.add ("ol"); + suppressLinebreakTags.add ("pre"); + } + + // tags which signal us to stop suppressing \n ->
<br> encoding + // these usually signal transition from structural tags to normal + // HTML text, e.g. <td>
+ static final HashSet encodeLinebreakTags = new HashSet (); + static { + encodeLinebreakTags.add ("td"); + encodeLinebreakTags.add ("th"); + encodeLinebreakTags.add ("li"); + } + /** - * - */ + * + */ public final static String encode (String str) { // try to make stringbuffer large enough from the start StringBuffer ret = new StringBuffer (Math.round (str.length()*1.4f)); encode (str, ret); - return ret.toString(); + return ret.toString(); } - + /** - * - */ + * + */ public final static void encode (String str, StringBuffer ret) { if (str == null) return; - - int l = str.length(); - - boolean closeTag=false, readTag=false, tagOpen=false; - // the difference between swallowOneNewline and ignoreNewline is that swallowOneNewline is just effective once (for the next newline) + + char[] chars = str.toCharArray (); + int l = chars.length; + + // are we currently within a < and a >? + boolean insideTag=false; + // if we are inside a <code> tag, we encode everything to make + // documentation work easier + boolean insideCodeTag = false; + // the difference between swallowOneNewline and ignoreNewline is that + // swallowOneNewline is just effective once (for the next newline) boolean ignoreNewline = false; boolean swallowOneNewline = false; - StringBuffer tag = new StringBuffer (); - + for (int i=0; i"); - if (!tagOpen) - swallowOneNewline = false; + case '&': + // check if this is an HTML entity already, in which case we pass it through unchanged + if (i < l-4 && !insideCodeTag) { + // is this a numeric entity? + if (chars[i+1] == '#' ) { + int j = i+2; + while (j tagStart && j < l) { + String tagName = new String (chars, tagStart, j-tagStart).toLowerCase (); + if ("code".equals (tagName) && insideCloseTag && insideCodeTag) + insideCodeTag = false; + if (allTags.contains (tagName) && !insideCodeTag) { + insideTag = true; + ret.append ('<'); + // set ignoreNewline on some tags, depending on whether they're + // being opened or closed. + // what's going on here? 
we switch newline encoding on inside some tags, for + // others we switch it on when they're closed + if (encodeLinebreakTags.contains (tagName)) { + ignoreNewline = insideCloseTag; + swallowOneNewline = true; + } else if (suppressLinebreakTags.contains (tagName)) { + ignoreNewline = !insideCloseTag; + swallowOneNewline = true; + } else if ("p".equalsIgnoreCase (tagName) || + "blockquote".equalsIgnoreCase (tagName) || + "bq".equalsIgnoreCase (tagName)) { + swallowOneNewline = true; + } + if ("code".equals (tagName) && !insideCloseTag) + insideCodeTag = true; + break; + } + } + } // if (i < l-2) + ret.append ("&lt;"); + break; + case '\n': + ret.append ('\n'); + if (!insideTag && !ignoreNewline && !swallowOneNewline) + ret.append ("<br />");
+ if (!insideTag) + swallowOneNewline = false; break; case '>': - tagOpen = false; - ret.append ('>'); + if (insideTag) + ret.append ('>'); + else + ret.append ("&gt;"); + insideTag = false; break; default: - // ret.append (c); - if (c < 128) - ret.append (c); - else if (c >= 128 && c < 256) - ret.append (transform[c-128]); - else { - ret.append ("&#"); - ret.append ((int) c); - ret.append (";"); - } - if (!tagOpen && !Character.isWhitespace (c)) - swallowOneNewline = false; + // ret.append (c); + if (c < 128) + ret.append (c); + else if (c >= 128 && c < 256) + ret.append (transform[c-128]); + else { + ret.append ("&#"); + ret.append ((int) c); + ret.append (";"); + } + if (!insideTag && !Character.isWhitespace (c)) + swallowOneNewline = false; } } }