Enhancements in Helma format() functions:

* implement tag balancing: - unclosed tags are closed when necessary (either when an enclosing tag is closed or when all text is processed and tags are still open) - closed tags that were not opened are dropped * be smart in newline-to-<br> conversion: when an HTML block element is met, some or all newline characters are excluded from conversion. * implemented "allowed tags" feature, all other tags are escaped. This is not yet exported to the JavaScript API, though.
2003-03-21 18:17:23 +00:00 · 2003-03-21 18:17:23 +00:00 · f8123e7934
commit f8123e7934
parent 7763b6ac51
1 changed files with 166 additions and 80 deletions
--- a/src/helma/util/HtmlEncoder.java
+++ b/src/helma/util/HtmlEncoder.java
@ -263,29 +263,58 @@ public final class HtmlEncoder {
 	allTags.add ("xmp");
    }
-    // tags which signal us to start suppressing \n -> <br> encoding
+
-    // these are "structrural" tags, for example, we don't want to add <br>s 
+    // HTML block tags need to suppress automatic newline to <br>
-    // between a <table> and a <tr>.
+    // conversion around them to look good. However, they differ 
-    static final HashSet suppressLinebreakTags = new HashSet ();
+    // in how many newlines around them should be ignored. These sets
    // help to treat each tag right in newline conversion.
    static final HashSet swallowAll = new HashSet ();
    static final HashSet swallowTwo = new HashSet ();
    static final HashSet swallowOne = new HashSet ();
    static {
-	suppressLinebreakTags.add ("table");
+	// actual block level elements
-	suppressLinebreakTags.add ("ul");
+	swallowOne.add ("address");
-	suppressLinebreakTags.add ("ol");
+	swallowTwo.add ("blockquote");
-	suppressLinebreakTags.add ("pre");
+	swallowTwo.add ("center");
 	swallowOne.add ("dir");
 	swallowOne.add ("div");
 	swallowTwo.add ("dl");
 	swallowTwo.add ("fieldset");
 	swallowTwo.add ("form");
 	swallowTwo.add ("h1");
 	swallowTwo.add ("h2");
 	swallowTwo.add ("h3");
 	swallowTwo.add ("h4");
 	swallowTwo.add ("h5");
 	swallowTwo.add ("h6");
 	swallowTwo.add ("hr");
 	swallowTwo.add ("isindex");
 	swallowAll.add ("menu");
 	swallowAll.add ("noframes");
 	swallowAll.add ("noscript");
 	swallowTwo.add ("ol");
 	swallowTwo.add ("p");
 	swallowTwo.add ("pre");
 	swallowOne.add ("table");
 	swallowTwo.add ("ul");
 	/// to be treated as block level elements
 	swallowTwo.add ("dd");
 	swallowTwo.add ("dt");
 	swallowTwo.add ("frameset");
 	swallowTwo.add ("li");
 	swallowAll.add ("tbody");
 	swallowTwo.add ("td");
 	swallowAll.add ("tfoot");
 	swallowOne.add ("th");
 	swallowAll.add ("thead");
 	swallowAll.add ("tr");
    }
    // tags which signal us to stop suppressing \n -> <br> encoding
    // these usually signal transition from structural tags to normal
    // HTML text, e.g. <td>
    static final HashSet encodeLinebreakTags = new HashSet ();
    static {
 	encodeLinebreakTags.add ("td");
 	encodeLinebreakTags.add ("th");
 	encodeLinebreakTags.add ("li");
    }
    /**
-     *
+     *  Do "smart" encoding on a string. This means that valid HTML entities and tags,
     *  Helma macros and HTML comments are passed through unescaped, while
     *  other occurrences of '<', '>' and '&' are encoded to HTML entities.
     */
    public final static String encode (String str) {
 	if  (str == null)
@ -295,7 +324,7 @@ public final class HtmlEncoder {
 	    return "";
 	// try to make stringbuffer large enough from the start
 	StringBuffer ret = new StringBuffer (Math.round (l*1.4f));
-	encode (str, ret);
+	encode (str, ret, null);
 	return ret.toString();
    }
@ -305,11 +334,21 @@ public final class HtmlEncoder {
     *  other occurrences of '<', '>' and '&' are encoded to HTML entities.
     */
    public final static void encode (String str, StringBuffer ret) {
 	// Delegate to the three-argument variant. Passing null as allowedTags
 	// means no tag whitelist is applied (that variant tests allowedTags == null
 	// before consulting the set).
 	encode (str, ret, null);
    }
    /**
     *  Do "smart" encoding on a string. This means that valid HTML entities and tags,
     *  Helma macros and HTML comments are passed through unescaped, while
     *  other occurrences of '<', '>' and '&' are encoded to HTML entities.
     */
    public final static void encode (String str, StringBuffer ret, Set allowedTags) {
 	if  (str == null)
 	    return;
 	int l = str.length();
 	Stack openTags = new Stack();
 	// are we currently within a < and a > that constitute some kind of tag?
 	// we use tag balancing to know whether we are inside a tag (and should
 	// pass things through unchanged) or outside (and should encode stuff).
@ -319,6 +358,7 @@ public final class HtmlEncoder {
 	// if we are inside a <code> tag, we encode everything to make
 	// documentation work easier
 	boolean insideCodeTag = false;
 	boolean insidePreTag = false;
 	// are we within a Helma <% macro %> tag? We treat macro tags and
 	// comments specially, since we can't rely on tag balancing
 	// to know when we leave a macro tag or comment.
@ -328,17 +368,104 @@ public final class HtmlEncoder {
 	// the quotation mark we are in within an HTML or Macro tag, if any
 	char htmlQuoteChar = '\u0000';
 	char macroQuoteChar = '\u0000';
-	// the difference between swallowOneNewline and ignoreNewline is that
+	// number of newlines to ignore in \n -> <br> conversion
-	// swallowOneNewline is just effective once (for the next newline)
+	int swallowLinebreaks = 0;
-	boolean ignoreNewline = false;
+	// number of newlines met since the last non-whitespace character
-	boolean swallowOneNewline = false;
+	int linebreaks = 0;
 	// did we meet a backslash escape?
 	boolean escape = false;
 	for (int i=0; i<l; i++) {
 	    char c = str.charAt(i);
 	    // step one: check if this is the beginning of an HTML tag, comment or
 	    // Helma macro.
 	    if (c == '<') {
 	            if (i < l-2) {
 	                if (!insideMacroTag && '%' == str.charAt(i+1)) {
 	                    // this is the beginning of a Helma macro tag
 	                    if (!insideCodeTag) {
 	                        insideMacroTag = insideTag = true;
 	                        macroQuoteChar = '\u0000';
 	                    }
 	                } else if ('!' == str.charAt(i+1) && '-' == str.charAt(i+2)) {
 	                    // the beginning of an HTML comment?
 	                    if (!insideCodeTag)
 	                        insideComment = insideTag = (i<l-3 && '-' == str.charAt(i+3));
 	                } else if (!insideTag) {
 	                    // check if this is a HTML tag.
 	                    boolean insideCloseTag = ('/' == str.charAt(i+1));
 	                    int tagStart = insideCloseTag ? i+2 : i+1;
 	                    int j = tagStart;
 	                    while (j<l && Character.isLetterOrDigit (str.charAt(j)))
 	                        j++;
 	                    if (j > tagStart && j < l) {
 	                        String tagName = str.substring (tagStart, j).toLowerCase();
 	                        if ("code".equals (tagName) && insideCloseTag && insideCodeTag)
 	                            insideCodeTag = false;
 	                        if ((allowedTags == null || allowedTags.contains (tagName)) &&
 	                                          allTags.contains (tagName) && !insideCodeTag) {
 	                            insideHtmlTag = insideTag = true;
 	                            htmlQuoteChar = '\u0000';
 	                            // set ignoreNewline on some tags, depending on whether they're
 	                            // being opened or closed.
 	                            // what's going on here? we switch newline encoding on inside some tags, for
 	                            // others we switch it on when they're closed
 	                            linebreaks = Math.max(linebreaks-swallowLinebreaks, 0);
 	                            if (swallowAll.contains (tagName)) {
 	                               swallowLinebreaks = 1000;
 	                            } else if (swallowTwo.contains (tagName)) {
 	                               swallowLinebreaks = 2;
 	                            } else if (swallowOne.contains (tagName)) {
 	                               swallowLinebreaks = 1;
 	                            } else {
 	                               swallowLinebreaks = 0;
 	                            }
 	                            if (insideCloseTag) {
 	                                int t = openTags.search (tagName);
 	                                if (t == -1) {
 	                                    i = j;
 	                                    insideHtmlTag = insideTag = false;
 	                                    continue;
 	                                } else if (t > 1) {
 	                                    for (int k=1; k<t; k++) {
 	                                        ret.append ("</");
 	                                        ret.append (openTags.pop());
 	                                        ret.append (">");
 	                                    }
 	                                }
 	                                openTags.pop ();
 	                            } else {
 	                                openTags.push (tagName);
 	                                swallowLinebreaks = Math.max (swallowLinebreaks-1, 0);
 	                            }
 	                            if ("code".equals (tagName) && !insideCloseTag)
 	                                insideCodeTag = true;
 	                            if ("pre".equals (tagName))
 	                                insidePreTag = !insideCloseTag;
 	                        }
 	                    }
 	                }
 	            } // if (i < l-2)
 	    }
 	    if (linebreaks > 0 && !Character.isWhitespace(c)) {
 	       if (!insidePreTag && linebreaks > swallowLinebreaks) {
 	           linebreaks -= swallowLinebreaks;
 	           for (int k=0; k<linebreaks; k++)
 	               ret.append ("\n<br />");
 	       }
 	       if (!insideTag)
 	           swallowLinebreaks = 0;
 	       linebreaks = 0;
 	    }
 	    switch (c) {
 	       case '<':
 	            if (insideTag)
 	                ret.append ('<');
 	            else
 	                ret.append ("&lt;");
 	            break;
 	        case '&':
 	            // check if this is an HTML entity already,
 	            // in which case we pass it though unchanged
@ -365,56 +492,6 @@ public final class HtmlEncoder {
 	            // we didn't reach a break, so encode the ampersand as HTML entity
 	            ret.append ("&amp;");
 	            break;
 	        case '<':
 	            if (i < l-2) {
 	                if (!insideMacroTag && '%' == str.charAt(i+1)) {
 	                    // this is the beginning of a Helma macro tag
 	                    if (!insideCodeTag) {
 	                        insideMacroTag = insideTag = true;
 	                        macroQuoteChar = '\u0000';
 	                    }
 	                } else if ('!' == str.charAt(i+1) && '-' == str.charAt(i+2)) {
 	                    // the beginning of an HTML comment?
 	                    if (!insideCodeTag)
 	                        insideComment = insideTag = (i<l-3 && '-' == str.charAt(i+3));
 	                } else if (!insideTag) {
 	                    // check if this is a HTML tag.
 	                    boolean insideCloseTag = ('/' == str.charAt(i+1));
 	                    int tagStart = insideCloseTag ? i+2 : i+1;
 	                    int j = tagStart;
 	                    while (j<l && Character.isLetterOrDigit (str.charAt(j)))
 	                        j++;
 	                    if (j > tagStart && j < l) {
 	                        String tagName = str.substring (tagStart, j).toLowerCase();
 	                        if ("code".equals (tagName) && insideCloseTag && insideCodeTag)
 	                            insideCodeTag = false;
 	                        if (allTags.contains (tagName) && !insideCodeTag) {
 	                            insideHtmlTag = insideTag = true;
 	                            htmlQuoteChar = '\u0000';
 	                            // set ignoreNewline on some tags, depending on whether they're
 	                            // being opened or closed.
 	                            // what's going on here? we switch newline encoding on inside some tags, for
 	                            // others we switch it on when they're closed
 	                            if (encodeLinebreakTags.contains (tagName)) {
 	                                ignoreNewline = insideCloseTag;
 	                                swallowOneNewline = true;
 	                            } else if (suppressLinebreakTags.contains (tagName)) {
 	                                ignoreNewline = !insideCloseTag;
 	                                swallowOneNewline = true;
 	                            } else if ("p".equals (tagName) || "blockquote".equals (tagName) || "bq".equals (tagName)) {
 	                                swallowOneNewline = true;
 	                            }
 	                            if ("code".equals (tagName) && !insideCloseTag)
 	                                insideCodeTag = true;
 	                        }
 	                    }
 	                }
 	            } // if (i < l-2)
 	            if (insideTag)
 	                ret.append ('<');
 	            else
 	                ret.append ("&lt;");
 	            break;
 	        case '\\':
 	            ret.append (c);
 	            if (insideTag && !insideComment)
@ -444,11 +521,8 @@ public final class HtmlEncoder {
 	            break;
 	        case  '\n':
 	            ret.append ('\n');
-	            if (!insideTag) {
+	            if (!insideTag)
-	                if (!ignoreNewline && !swallowOneNewline)
+	                linebreaks++;
 	                    ret.append ("<br />");
 	                swallowOneNewline = false;
 	            }
 	            break;
 	        case '>':
 	            // For Helma macro tags and comments, we overrule tag balancing,
@ -465,6 +539,11 @@ public final class HtmlEncoder {
 	                // only leave HTML tag if quotation marks are balanced
 	                // within that tag.
 	                insideHtmlTag = htmlQuoteChar != '\u0000';
 	                // Check if this is an empty tag so we don't generate an
 	                // additional </close> tag.
 	                if (str.charAt(i-1) == '/') {
 	                    openTags.pop();
 	                }
 	            } else {
 	                ret.append ("&gt;");
 	            }
@ -482,11 +561,18 @@ public final class HtmlEncoder {
 	                ret.append ((int) c);
 	                ret.append (";");
 	            }
 	            if (swallowOneNewline && !insideTag && !Character.isWhitespace (c))
 	                swallowOneNewline = false;
 	            escape = false;
 	    }
 	}
 	// if tags were opened but not closed, close them.
 	int o = openTags.size();
 	if (o > 0) {
 	    for (int k=0; k<o; k++) {
 	        ret.append ("</");
 	        ret.append (openTags.pop());
 	        ret.append (">");
 	    }
 	}
     }