From 962b2b6e6cb860851a0810af9b41c82d351af813 Mon Sep 17 00:00:00 2001
From: hns <hannesw@gmail.com>
Date: Fri, 21 Jun 2002 13:41:29 +0000
Subject: [PATCH] HTML encoding is now smarter about encoding &, < and >. If
 they are part of a valid HTML entity reference, an HTML tag or a Helma macro
 they are passed through unchanged, otherwise they are encoded to &amp;, &lt;
 or &gt;.

Another new feature that should make writing documentation on
Helma easier is that everything is encoded if it is placed within
a <code> tag.
---
 src/helma/util/HtmlEncoder.java | 288 ++++++++++++++++++++++++--------
 1 file changed, 221 insertions(+), 67 deletions(-)
diff --git a/src/helma/util/HtmlEncoder.java b/src/helma/util/HtmlEncoder.java
index d9b23fa9..bc99301c 100644
--- a/src/helma/util/HtmlEncoder.java
+++ b/src/helma/util/HtmlEncoder.java
@@ -158,94 +158,248 @@ public final class HtmlEncoder {
     };
 
 
+    static final HashSet allTags = new HashSet ();
+    static {
+	allTags.add ("a");
+	allTags.add ("abbr");
+	allTags.add ("address");
+	allTags.add ("applet");
+	allTags.add ("area");
+	allTags.add ("b");
+	allTags.add ("base");
+	allTags.add ("basefont");
+	allTags.add ("bgsound");
+	allTags.add ("big");
+	allTags.add ("blink");
+	allTags.add ("blockquote");
+	allTags.add ("bq");
+	allTags.add ("body");
+	allTags.add ("br");
+	allTags.add ("button");
+	allTags.add ("caption");
+	allTags.add ("center");
+	allTags.add ("cite");
+	allTags.add ("code");
+	allTags.add ("col");
+	allTags.add ("colgroup");
+	allTags.add ("del");
+	allTags.add ("dir");
+	allTags.add ("div");
+	allTags.add ("dl");
+	allTags.add ("dt");
+	allTags.add ("dd");
+	allTags.add ("em");
+	allTags.add ("embed");
+	allTags.add ("fieldset");
+	allTags.add ("font");
+	allTags.add ("form");
+	allTags.add ("frame");
+	allTags.add ("frameset");
+	allTags.add ("h1");
+	allTags.add ("h2");
+	allTags.add ("h3");
+	allTags.add ("h4");
+	allTags.add ("h5");
+	allTags.add ("h6");
+	allTags.add ("head");
+	allTags.add ("html");
+	allTags.add ("i");
+	allTags.add ("iframe");
+	allTags.add ("img");
+	allTags.add ("input");
+	allTags.add ("ins");
+	allTags.add ("isindex");
+	allTags.add ("kbd");
+	allTags.add ("li");
+	allTags.add ("link");
+	allTags.add ("listing");
+	allTags.add ("map");
+	allTags.add ("marquee");
+	allTags.add ("menu");
+	allTags.add ("meta");
+	allTags.add ("nobr");
+	allTags.add ("noframes");
+	allTags.add ("object");
+	allTags.add ("ol");
+	allTags.add ("option");
+	allTags.add ("optgroup");
+	allTags.add ("p");
+	allTags.add ("param");
+	allTags.add ("plaintext");
+	allTags.add ("pre");
+	allTags.add ("q");
+	allTags.add ("samp");
+	allTags.add ("script");
+	allTags.add ("select");
+	allTags.add ("small");
+	allTags.add ("span");
+	allTags.add ("strike");
+	allTags.add ("strong");
+	allTags.add ("style");
+	allTags.add ("sub");
+	allTags.add ("sup");
+	allTags.add ("table");
+	allTags.add ("tbody");
+	allTags.add ("td");
+	allTags.add ("textarea");
+	allTags.add ("tfoot");
+	allTags.add ("th");
+	allTags.add ("thead");
+	allTags.add ("title");
+	allTags.add ("tr");
+	allTags.add ("tt");
+	allTags.add ("u");
+	allTags.add ("ul");
+	allTags.add ("var");
+	allTags.add ("wbr");
+	allTags.add ("xmp");
+	allTags.add ("%");
+    }
+
+    // tags which signal us to start suppressing \n -> <br> encoding
+    // these are "structural" tags, for example, we don't want to add <br>s 
+    // between a <table> and a <tr>.
+    static final HashSet suppressLinebreakTags = new HashSet ();
+    static {
+	suppressLinebreakTags.add ("table");
+	suppressLinebreakTags.add ("ul");
+	suppressLinebreakTags.add ("ol");
+	suppressLinebreakTags.add ("pre");
+    }
+
+    // tags which signal us to stop suppressing \n -> <br> encoding
+    // these usually signal transition from structural tags to normal
+    // HTML text, e.g. <td>
+    static final HashSet encodeLinebreakTags = new HashSet ();
+    static {
+	encodeLinebreakTags.add ("td");
+	encodeLinebreakTags.add ("th");
+	encodeLinebreakTags.add ("li");
+    }
+
     /**
-     * 
-     */ 
+     *
+     */
     public final static String encode (String str) {
 	// try to make stringbuffer large enough from the start
 	StringBuffer ret = new StringBuffer (Math.round (str.length()*1.4f));
 	encode (str, ret);
-	return ret.toString(); 
+	return ret.toString();
     }
-    
+
     /**
-     *  
-     */ 
+     *
+     */
     public final static void encode (String str, StringBuffer ret) {
 	if  (str == null)
 	    return;
-	
-	int l = str.length();
-	
-	boolean closeTag=false, readTag=false, tagOpen=false;
-	// the difference between swallowOneNewline and ignoreNewline is that swallowOneNewline is just effective once (for the next newline)
+
+	char[] chars = str.toCharArray ();
+	int l = chars.length;
+
+	// are we currently within a < and a >?
+	boolean insideTag=false;
+	// if we are inside a <code> tag, we encode everything to make 
+	// documentation work easier
+	boolean insideCodeTag = false;
+	// the difference between swallowOneNewline and ignoreNewline is that
+	// swallowOneNewline is just effective once (for the next newline)
 	boolean ignoreNewline = false;
 	boolean swallowOneNewline = false;
-	StringBuffer tag = new StringBuffer ();
-	
+
 	for (int i=0; i<l; i++) {
-	    char c = str.charAt (i);
-	    if (readTag) {
-	        if (Character.isLetterOrDigit (c))
-	            tag.append (c);
-	        else if ('/' == c)
-	            closeTag = true;
-	        else {
-	            String t = tag.toString ();
-	            // set ignoreNewline on some tags, depending on wheather they're
-	            // being opened or closed.
-	            // what's going on here? we switch newline encoding on inside some tags, for
-	            // others we switch it on when they're closed
-	            if ("td".equalsIgnoreCase (t) || "th".equalsIgnoreCase (t) || "li".equalsIgnoreCase (t)) {
-	                ignoreNewline = closeTag;
-	                swallowOneNewline = true;
-	            } else if ("table".equalsIgnoreCase (t) || "ul".equalsIgnoreCase (t) || "ol".equalsIgnoreCase (t) || "pre".equalsIgnoreCase (t)) {
-	                ignoreNewline = !closeTag;
-	                swallowOneNewline = true;
-	            } else if ("p".equalsIgnoreCase (t)) {
-	                swallowOneNewline = true;
-	            }
-	
-	            readTag = false;
-	            closeTag = false;
-	            tag.setLength (0);
-	        }
-	    } // if (readTag)
+	    char c = chars[i];
 
 	    switch (c) {
-	        // case '&':
-                      //    ret.append ("&amp;");
-	        //    break;
-	        case  '\n':
-	            ret.append ('\n');
-                         if (!ignoreNewline && !swallowOneNewline)
-	                ret.append ("<br />");
-	            if (!tagOpen)
-	                swallowOneNewline = false;
+	        case '&':
+	            // check if this is an HTML entity already, in which case we pass it through unchanged
+	            if (i < l-4 && !insideCodeTag) {
+	                // is this a numeric entity?
+	                if (chars[i+1] == '#' ) {
+	                   int j = i+2;
+	                   while (j<l && Character.isDigit (chars[j]))
+	                       j++;
+	                   if (j<l && chars[j] == ';') {
+	                       ret.append ("&");
+	                       break;
+	                   }
+	                } else {
+	                   int j = i+1;
+	                   while (j<l && Character.isLetterOrDigit (chars[j]))
+	                       j++;
+	                   if (j<l && chars[j] == ';') {
+	                       ret.append ("&");
+	                       break;
+	                   }
+	                }
+	            }
+	            // we didn't reach a break, so encode the ampersand as HTML entity
+	            ret.append ("&amp;");
 	            break;
 	        case '<':
-	            closeTag = false;
-	            readTag = true;
-	            tagOpen = true;
-	            ret.append ('<');
+	            if (i < l-2) {
+	                boolean insideCloseTag = ('/' == chars[i+1]);
+	                int tagStart = insideCloseTag ? i+2 : i+1;
+	                int j = tagStart;
+	                while (j<l && (Character.isLetterOrDigit (chars[j]) || chars[j] == '%'))
+	                    j++;
+	                if (j > tagStart && j < l) {
+	                    String tagName = new String (chars, tagStart, j-tagStart).toLowerCase ();
+	                    if ("code".equals (tagName) && insideCloseTag && insideCodeTag)
+	                        insideCodeTag = false;
+	                    if (allTags.contains (tagName) && !insideCodeTag) {
+	                        insideTag = true;
+	                        ret.append ('<');
+	                        // set ignoreNewline on some tags, depending on whether they're
+	                        // being opened or closed.
+	                        // what's going on here? we switch newline encoding on inside some tags, for
+	                        // others we switch it on when they're closed
+	                        if (encodeLinebreakTags.contains (tagName)) {
+	                            ignoreNewline = insideCloseTag;
+	                            swallowOneNewline = true;
+	                        } else if (suppressLinebreakTags.contains (tagName)) {
+	                            ignoreNewline = !insideCloseTag;
+	                            swallowOneNewline = true;
+	                        } else if ("p".equalsIgnoreCase (tagName) || 
+	                                     "blockquote".equalsIgnoreCase (tagName) ||
+	                                     "bq".equalsIgnoreCase (tagName)) {
+	                            swallowOneNewline = true;
+	                        }
+	                        if ("code".equals (tagName) && !insideCloseTag)
+	                            insideCodeTag = true;
+	                        break;
+	                    }
+	                }
+	            } // if (i < l-2)
+	            ret.append ("&lt;");
+	            break;
+	        case  '\n':
+	            ret.append ('\n');
+	            if (!insideTag && !ignoreNewline && !swallowOneNewline)
+	                ret.append ("<br />");
+	            if (!insideTag)
+	                swallowOneNewline = false;
 	            break;
 	        case '>':
-	            tagOpen = false;
-	            ret.append ('>');
+	            if (insideTag)
+	                ret.append ('>');
+	            else
+	                ret.append ("&gt;");
+	            insideTag = false;
 	            break;
 	        default:
-	             // ret.append (c);
-	             if (c < 128)
-	                 ret.append (c);
-	             else if (c >= 128 && c < 256)
-	                 ret.append (transform[c-128]);
-	             else { 
-	                 ret.append ("&#");
-	                 ret.append ((int) c);
-	                 ret.append (";");
-	             }
-	             if (!tagOpen && !Character.isWhitespace (c))
-	                 swallowOneNewline = false;
+	            // ret.append (c);
+	            if (c < 128)
+	                ret.append (c);
+	            else if (c >= 128 && c < 256)
+	                ret.append (transform[c-128]);
+	            else {
+	                ret.append ("&#");
+	                ret.append ((int) c);
+	                ret.append (";");
+	            }
+	            if (!insideTag && !Character.isWhitespace (c))
+	                swallowOneNewline = false;
 	    }
 	}
      }