From 7f9c5d65c0c3fa053278d3b971d1c0b7afb3d2fe Mon Sep 17 00:00:00 2001 From: hns Date: Tue, 5 Nov 2002 18:52:30 +0000 Subject: [PATCH] Reworked encode() method: The hierarchy of HTML tags, macro tags and HTML comments is now right. Also made the method perform a bit faster by not creating an intermediate char array. Fixes bug 152. --- src/helma/util/HtmlEncoder.java | 175 +++++++++++++++++++------------- 1 file changed, 106 insertions(+), 69 deletions(-) diff --git a/src/helma/util/HtmlEncoder.java b/src/helma/util/HtmlEncoder.java index bcdb48d1..987ff043 100644 --- a/src/helma/util/HtmlEncoder.java +++ b/src/helma/util/HtmlEncoder.java @@ -294,50 +294,61 @@ public final class HtmlEncoder { } /** - * + * Do "smart" encoding on a string. This means that valid HTML entities and tags, + * Helma macros and HTML comments are passed through unescaped, while + * other occurrences of '<', '>' and '&' are encoded to HTML entities. */ public final static void encode (String str, StringBuffer ret) { if (str == null) return; - char[] chars = str.toCharArray (); - int l = chars.length; + int l = str.length(); - // are we currently within a < and a >? - boolean insideTag=false; + // are we currently within a < and a > that constitute some kind of tag? + // we use tag balancing to know whether we are inside a tag (and should + // pass things through unchanged) or outside (and should encode stuff). + boolean insideTag = false; + // are we inside an HTML tag? + boolean insideHtmlTag = false; // if we are inside a tag, we encode everything to make // documentation work easier boolean insideCodeTag = false; - // are we within a macro tag? + // are we within a Helma <% macro %> tag? We treat macro tags and + // comments specially, since we can't rely on tag balancing + // to know when we leave a macro tag or comment. boolean insideMacroTag = false; // are we inside an HTML comment?
boolean insideComment = false; + // the quotation mark we are in within an HTML or Macro tag, if any + char htmlQuoteChar = '\u0000'; + char macroQuoteChar = '\u0000'; // the difference between swallowOneNewline and ignoreNewline is that // swallowOneNewline is just effective once (for the next newline) boolean ignoreNewline = false; boolean swallowOneNewline = false; for (int i=0; i tagStart && j < l) { - String tagName = new String (chars, tagStart, j-tagStart).toLowerCase (); - if ("code".equals (tagName) && insideCloseTag && insideCodeTag) - insideCodeTag = false; - if (allTags.contains (tagName) && !insideCodeTag) { - insideTag = true; - ret.append ('<'); - // set ignoreNewline on some tags, depending on wheather they're - // being opened or closed. - // what's going on here? we switch newline encoding on inside some tags, for - // others we switch it on when they're closed - if (encodeLinebreakTags.contains (tagName)) { - ignoreNewline = insideCloseTag; - swallowOneNewline = true; - } else if (suppressLinebreakTags.contains (tagName)) { - ignoreNewline = !insideCloseTag; - swallowOneNewline = true; - } else if ("p".equals (tagName) || "blockquote".equals (tagName) || "bq".equals (tagName)) { - swallowOneNewline = true; + if (i < l-2) { + if (!insideMacroTag && '%' == str.charAt(i+1)) { + // this is the beginning of a Helma macro tag + insideMacroTag = insideTag = true; + macroQuoteChar = '\u0000'; + } else if ('!' == str.charAt(i+1) && '-' == str.charAt(i+2)) { + // the beginning of an HTML comment? + insideComment = insideTag = (i tagStart && j < l) { + String tagName = str.substring (tagStart, j).toLowerCase(); + if ("code".equals (tagName) && insideCloseTag && insideCodeTag) + insideCodeTag = false; + if (allTags.contains (tagName) && !insideCodeTag) { + insideHtmlTag = insideTag = true; + htmlQuoteChar = '\u0000'; + // set ignoreNewline on some tags, depending on wheather they're + // being opened or closed. + // what's going on here? 
we switch newline encoding on inside some tags, for + // others we switch it on when they're closed + if (encodeLinebreakTags.contains (tagName)) { + ignoreNewline = insideCloseTag; + swallowOneNewline = true; + } else if (suppressLinebreakTags.contains (tagName)) { + ignoreNewline = !insideCloseTag; + swallowOneNewline = true; + } else if ("p".equals (tagName) || "blockquote".equals (tagName) || "bq".equals (tagName)) { + swallowOneNewline = true; + } + if ("code".equals (tagName) && !insideCloseTag) + insideCodeTag = true; } - if ("code".equals (tagName) && !insideCloseTag) - insideCodeTag = true; - break; } } } // if (i < l-2) - ret.append ("<"); + if (insideTag) + ret.append ('<'); + else + ret.append ("<"); + break; + case '"': + case '\'': + ret.append (c); + if (!insideComment) { + if (insideMacroTag) { + if (macroQuoteChar == c) + macroQuoteChar = '\u0000'; + else if (macroQuoteChar == '\u0000') + macroQuoteChar = c; + } else if (insideHtmlTag) { + if (htmlQuoteChar == c) + htmlQuoteChar = '\u0000'; + else if (htmlQuoteChar == '\u0000') + htmlQuoteChar = c; + } + } break; case '\n': ret.append ('\n'); - if (!insideTag && !ignoreNewline && !swallowOneNewline) - ret.append ("
"); - if (!insideTag) + if (!insideTag) { + if (!ignoreNewline && !swallowOneNewline) + ret.append ("
"); swallowOneNewline = false; + } break; case '>': - if (insideTag) { + // For Helma macro tags and comments, we overrule tag balancing, + // i.e. we don't require that '<' and '>' be balanced within + // macros and comments. Rather, we check for the matching closing tag. + if (insideComment) { ret.append ('>'); - if (insideMacroTag) - insideMacroTag = insideTag = !(chars[i-1] == '%'); - else if (insideComment) - insideComment = insideTag = !(chars[i-2] == '-' && chars[i-1] == '-'); - else - insideTag = false; + insideComment = !(str.charAt(i-2) == '-' && str.charAt(i-1) == '-'); + } else if (insideMacroTag) { + ret.append ('>'); + insideMacroTag = !(str.charAt(i-1) == '%' && macroQuoteChar == '\u0000'); + } else if (insideHtmlTag) { + ret.append ('>'); + // only leave HTML tag if quotation marks are balanced + // within that tag. + insideHtmlTag = htmlQuoteChar != '\u0000'; } else { ret.append (">"); } + // check if we still are inside any kind of tag + insideTag = insideComment || insideMacroTag || insideHtmlTag; break; default: // ret.append (c); @@ -428,7 +465,7 @@ public final class HtmlEncoder { ret.append ((int) c); ret.append (";"); } - if (!insideTag && !Character.isWhitespace (c)) + if (swallowOneNewline && !insideTag && !Character.isWhitespace (c)) swallowOneNewline = false; } }