diff --git a/src/helma/util/HtmlEncoder.java b/src/helma/util/HtmlEncoder.java index 2bc5f09f..dcf971f4 100644 --- a/src/helma/util/HtmlEncoder.java +++ b/src/helma/util/HtmlEncoder.java @@ -263,29 +263,58 @@ public final class HtmlEncoder { allTags.add ("xmp"); } - // tags which signal us to start suppressing \n ->
encoding - // these are "structrural" tags, for example, we don't want to add
s - // between a and a . - static final HashSet suppressLinebreakTags = new HashSet (); + + // HTML block tags need to suppress automatic newline to
+ // conversion around them to look good. However, they differ + // in how many newlines around them should ignored. These sets + // help to treat each tag right in newline conversion. + static final HashSet swallowAll = new HashSet (); + static final HashSet swallowTwo = new HashSet (); + static final HashSet swallowOne = new HashSet (); static { - suppressLinebreakTags.add ("table"); - suppressLinebreakTags.add ("ul"); - suppressLinebreakTags.add ("ol"); - suppressLinebreakTags.add ("pre"); + // actual block level elements + swallowOne.add ("address"); + swallowTwo.add ("blockquote"); + swallowTwo.add ("center"); + swallowOne.add ("dir"); + swallowOne.add ("div"); + swallowTwo.add ("dl"); + swallowTwo.add ("fieldset"); + swallowTwo.add ("form"); + swallowTwo.add ("h1"); + swallowTwo.add ("h2"); + swallowTwo.add ("h3"); + swallowTwo.add ("h4"); + swallowTwo.add ("h5"); + swallowTwo.add ("h6"); + swallowTwo.add ("hr"); + swallowTwo.add ("isindex"); + swallowAll.add ("menu"); + swallowAll.add ("noframes"); + swallowAll.add ("noscript"); + swallowTwo.add ("ol"); + swallowTwo.add ("p"); + swallowTwo.add ("pre"); + swallowOne.add ("table"); + swallowTwo.add ("ul"); + /// to be treated as block level elements + swallowTwo.add ("dd"); + swallowTwo.add ("dt"); + swallowTwo.add ("frameset"); + swallowTwo.add ("li"); + swallowAll.add ("tbody"); + swallowTwo.add ("td"); + swallowAll.add ("tfoot"); + swallowOne.add ("th"); + swallowAll.add ("thead"); + swallowAll.add ("tr"); } - // tags which signal us to stop suppressing \n ->
encoding - // these usually signal transition from structural tags to normal - // HTML text, e.g.
- static final HashSet encodeLinebreakTags = new HashSet (); - static { - encodeLinebreakTags.add ("td"); - encodeLinebreakTags.add ("th"); - encodeLinebreakTags.add ("li"); - } /** - * + * Do "smart" encodging on a string. This means that valid HTML entities and tags, + * Helma macros and HTML comments are passed through unescaped, while + * other occurrences of '<', '>' and '&' are encoded to HTML entities. */ public final static String encode (String str) { if (str == null) @@ -295,7 +324,7 @@ public final class HtmlEncoder { return ""; // try to make stringbuffer large enough from the start StringBuffer ret = new StringBuffer (Math.round (l*1.4f)); - encode (str, ret); + encode (str, ret, null); return ret.toString(); } @@ -305,11 +334,21 @@ public final class HtmlEncoder { * other occurrences of '<', '>' and '&' are encoded to HTML entities. */ public final static void encode (String str, StringBuffer ret) { + encode (str, ret, null); + } + + /** + * Do "smart" encodging on a string. This means that valid HTML entities and tags, + * Helma macros and HTML comments are passed through unescaped, while + * other occurrences of '<', '>' and '&' are encoded to HTML entities. + */ + public final static void encode (String str, StringBuffer ret, Set allowedTags) { if (str == null) return; int l = str.length(); + Stack openTags = new Stack(); // are we currently within a < and a > that consitute some kind of tag? // we use tag balancing to know whether we are inside a tag (and should // pass things through unchanged) or outside (and should encode stuff). @@ -319,6 +358,7 @@ public final class HtmlEncoder { // if we are inside a tag, we encode everything to make // documentation work easier boolean insideCodeTag = false; + boolean insidePreTag = false; // are we within a Helma <% macro %> tag? We treat macro tags and // comments specially, since we can't rely on tag balancing // to know when we leave a macro tag or comment. @@ -328,17 +368,104 @@ public final class HtmlEncoder { // the quotation mark we are in within an HTML or Macro tag, if any char htmlQuoteChar = '\u0000'; char macroQuoteChar = '\u0000'; - // the difference between swallowOneNewline and ignoreNewline is that - // swallowOneNewline is just effective once (for the next newline) - boolean ignoreNewline = false; - boolean swallowOneNewline = false; + // number of newlines to ignore in \n ->
conversion + int swallowLinebreaks = 0; + // number of newlines met since the last non-whitespace character + int linebreaks = 0; // did we meet a backslash escape? boolean escape = false; for (int i=0; i tagStart && j < l) { + String tagName = str.substring (tagStart, j).toLowerCase(); + if ("code".equals (tagName) && insideCloseTag && insideCodeTag) + insideCodeTag = false; + if ((allowedTags == null || allowedTags.contains (tagName)) && + allTags.contains (tagName) && !insideCodeTag) { + insideHtmlTag = insideTag = true; + htmlQuoteChar = '\u0000'; + // set ignoreNewline on some tags, depending on wheather they're + // being opened or closed. + // what's going on here? we switch newline encoding on inside some tags, for + // others we switch it on when they're closed + linebreaks = Math.max(linebreaks-swallowLinebreaks, 0); + if (swallowAll.contains (tagName)) { + swallowLinebreaks = 1000; + } else if (swallowTwo.contains (tagName)) { + swallowLinebreaks = 2; + } else if (swallowOne.contains (tagName)) { + swallowLinebreaks = 1; + } else { + swallowLinebreaks = 0; + } + if (insideCloseTag) { + int t = openTags.search (tagName); + if (t == -1) { + i = j; + insideHtmlTag = insideTag = false; + continue; + } else if (t > 1) { + for (int k=1; k"); + } + } + openTags.pop (); + } else { + openTags.push (tagName); + swallowLinebreaks = Math.max (swallowLinebreaks-1, 0); + } + if ("code".equals (tagName) && !insideCloseTag) + insideCodeTag = true; + if ("pre".equals (tagName)) + insidePreTag = !insideCloseTag; + } + } + } + } // if (i < l-2) + } + if (linebreaks > 0 && !Character.isWhitespace(c)) { + if (!insidePreTag && linebreaks > swallowLinebreaks) { + linebreaks -= swallowLinebreaks; + for (int k=0; k"); + } + if (!insideTag) + swallowLinebreaks = 0; + linebreaks = 0; + } + switch (c) { + case '<': + if (insideTag) + ret.append ('<'); + else + ret.append ("<"); + break; case '&': // check if this is an HTML entity already, // in which case we pass it though unchanged @@ -365,56 +492,6 @@ public final class HtmlEncoder { // we didn't reach a break, so encode the ampersand as HTML entity ret.append ("&"); break; - case '<': - if (i < l-2) { - if (!insideMacroTag && '%' == str.charAt(i+1)) { - // this is the beginning of a Helma macro tag - if (!insideCodeTag) { - insideMacroTag = insideTag = true; - macroQuoteChar = '\u0000'; - } - } else if ('!' == str.charAt(i+1) && '-' == str.charAt(i+2)) { - // the beginning of an HTML comment? - if (!insideCodeTag) - insideComment = insideTag = (i tagStart && j < l) { - String tagName = str.substring (tagStart, j).toLowerCase(); - if ("code".equals (tagName) && insideCloseTag && insideCodeTag) - insideCodeTag = false; - if (allTags.contains (tagName) && !insideCodeTag) { - insideHtmlTag = insideTag = true; - htmlQuoteChar = '\u0000'; - // set ignoreNewline on some tags, depending on wheather they're - // being opened or closed. - // what's going on here? we switch newline encoding on inside some tags, for - // others we switch it on when they're closed - if (encodeLinebreakTags.contains (tagName)) { - ignoreNewline = insideCloseTag; - swallowOneNewline = true; - } else if (suppressLinebreakTags.contains (tagName)) { - ignoreNewline = !insideCloseTag; - swallowOneNewline = true; - } else if ("p".equals (tagName) || "blockquote".equals (tagName) || "bq".equals (tagName)) { - swallowOneNewline = true; - } - if ("code".equals (tagName) && !insideCloseTag) - insideCodeTag = true; - } - } - } - } // if (i < l-2) - if (insideTag) - ret.append ('<'); - else - ret.append ("<"); - break; case '\\': ret.append (c); if (insideTag && !insideComment) @@ -444,11 +521,8 @@ public final class HtmlEncoder { break; case '\n': ret.append ('\n'); - if (!insideTag) { - if (!ignoreNewline && !swallowOneNewline) - ret.append ("
"); - swallowOneNewline = false; - } + if (!insideTag) + linebreaks++; break; case '>': // For Helma macro tags and comments, we overrule tag balancing, @@ -465,6 +539,11 @@ public final class HtmlEncoder { // only leave HTML tag if quotation marks are balanced // within that tag. insideHtmlTag = htmlQuoteChar != '\u0000'; + // Check if this is an empty tag so we don't generate an + // additional tag. + if (str.charAt(i-1) == '/') { + openTags.pop(); + } } else { ret.append (">"); } @@ -482,11 +561,18 @@ public final class HtmlEncoder { ret.append ((int) c); ret.append (";"); } - if (swallowOneNewline && !insideTag && !Character.isWhitespace (c)) - swallowOneNewline = false; escape = false; } } + // if tags were opened but not closed, close them. + int o = openTags.size(); + if (o > 0) { + for (int k=0; k"); + } + } }