Enhancements in Helma format() functions:
* Implement tag balancing:
  - unclosed tags are closed when necessary (either when an enclosing tag is closed, or when all text has been processed and tags are still open)
  - closing tags that were never opened are dropped
* Be smart in newline-to-<br> conversion: when an HTML block element is encountered, some or all of the newline characters around it are excluded from conversion.
* Implement an "allowed tags" feature: all tags not in the allowed set are escaped. This is not yet exported to the JavaScript API, though.
This commit is contained in:
parent
7763b6ac51
commit
f8123e7934
1 changed files with 166 additions and 80 deletions
|
@ -263,29 +263,58 @@ public final class HtmlEncoder {
|
||||||
allTags.add ("xmp");
|
allTags.add ("xmp");
|
||||||
}
|
}
|
||||||
|
|
||||||
// tags which signal us to start suppressing \n -> <br> encoding
|
|
||||||
// these are "structrural" tags, for example, we don't want to add <br>s
|
// HTML block tags need to suppress automatic newline to <br>
|
||||||
// between a <table> and a <tr>.
|
// conversion around them to look good. However, they differ
|
||||||
static final HashSet suppressLinebreakTags = new HashSet ();
|
// in how many newlines around them should be ignored. These sets
|
||||||
|
// help to treat each tag right in newline conversion.
|
||||||
|
static final HashSet swallowAll = new HashSet ();
|
||||||
|
static final HashSet swallowTwo = new HashSet ();
|
||||||
|
static final HashSet swallowOne = new HashSet ();
|
||||||
static {
|
static {
|
||||||
suppressLinebreakTags.add ("table");
|
// actual block level elements
|
||||||
suppressLinebreakTags.add ("ul");
|
swallowOne.add ("address");
|
||||||
suppressLinebreakTags.add ("ol");
|
swallowTwo.add ("blockquote");
|
||||||
suppressLinebreakTags.add ("pre");
|
swallowTwo.add ("center");
|
||||||
|
swallowOne.add ("dir");
|
||||||
|
swallowOne.add ("div");
|
||||||
|
swallowTwo.add ("dl");
|
||||||
|
swallowTwo.add ("fieldset");
|
||||||
|
swallowTwo.add ("form");
|
||||||
|
swallowTwo.add ("h1");
|
||||||
|
swallowTwo.add ("h2");
|
||||||
|
swallowTwo.add ("h3");
|
||||||
|
swallowTwo.add ("h4");
|
||||||
|
swallowTwo.add ("h5");
|
||||||
|
swallowTwo.add ("h6");
|
||||||
|
swallowTwo.add ("hr");
|
||||||
|
swallowTwo.add ("isindex");
|
||||||
|
swallowAll.add ("menu");
|
||||||
|
swallowAll.add ("noframes");
|
||||||
|
swallowAll.add ("noscript");
|
||||||
|
swallowTwo.add ("ol");
|
||||||
|
swallowTwo.add ("p");
|
||||||
|
swallowTwo.add ("pre");
|
||||||
|
swallowOne.add ("table");
|
||||||
|
swallowTwo.add ("ul");
|
||||||
|
/// to be treated as block level elements
|
||||||
|
swallowTwo.add ("dd");
|
||||||
|
swallowTwo.add ("dt");
|
||||||
|
swallowTwo.add ("frameset");
|
||||||
|
swallowTwo.add ("li");
|
||||||
|
swallowAll.add ("tbody");
|
||||||
|
swallowTwo.add ("td");
|
||||||
|
swallowAll.add ("tfoot");
|
||||||
|
swallowOne.add ("th");
|
||||||
|
swallowAll.add ("thead");
|
||||||
|
swallowAll.add ("tr");
|
||||||
}
|
}
|
||||||
|
|
||||||
// tags which signal us to stop suppressing \n -> <br> encoding
|
|
||||||
// these usually signal transition from structural tags to normal
|
|
||||||
// HTML text, e.g. <td>
|
|
||||||
static final HashSet encodeLinebreakTags = new HashSet ();
|
|
||||||
static {
|
|
||||||
encodeLinebreakTags.add ("td");
|
|
||||||
encodeLinebreakTags.add ("th");
|
|
||||||
encodeLinebreakTags.add ("li");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* Do "smart" encoding on a string. This means that valid HTML entities and tags,
|
||||||
|
* Helma macros and HTML comments are passed through unescaped, while
|
||||||
|
* other occurrences of '<', '>' and '&' are encoded to HTML entities.
|
||||||
*/
|
*/
|
||||||
public final static String encode (String str) {
|
public final static String encode (String str) {
|
||||||
if (str == null)
|
if (str == null)
|
||||||
|
@ -295,7 +324,7 @@ public final class HtmlEncoder {
|
||||||
return "";
|
return "";
|
||||||
// try to make stringbuffer large enough from the start
|
// try to make stringbuffer large enough from the start
|
||||||
StringBuffer ret = new StringBuffer (Math.round (l*1.4f));
|
StringBuffer ret = new StringBuffer (Math.round (l*1.4f));
|
||||||
encode (str, ret);
|
encode (str, ret, null);
|
||||||
return ret.toString();
|
return ret.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -305,11 +334,21 @@ public final class HtmlEncoder {
|
||||||
* other occurrences of '<', '>' and '&' are encoded to HTML entities.
|
* other occurrences of '<', '>' and '&' are encoded to HTML entities.
|
||||||
*/
|
*/
|
||||||
public final static void encode (String str, StringBuffer ret) {
|
public final static void encode (String str, StringBuffer ret) {
|
||||||
|
encode (str, ret, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Do "smart" encoding on a string. This means that valid HTML entities and tags,
|
||||||
|
* Helma macros and HTML comments are passed through unescaped, while
|
||||||
|
* other occurrences of '<', '>' and '&' are encoded to HTML entities.
|
||||||
|
*/
|
||||||
|
public final static void encode (String str, StringBuffer ret, Set allowedTags) {
|
||||||
if (str == null)
|
if (str == null)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
int l = str.length();
|
int l = str.length();
|
||||||
|
|
||||||
|
Stack openTags = new Stack();
|
||||||
// are we currently within a < and a > that constitute some kind of tag?
|
// are we currently within a < and a > that constitute some kind of tag?
|
||||||
// we use tag balancing to know whether we are inside a tag (and should
|
// we use tag balancing to know whether we are inside a tag (and should
|
||||||
// pass things through unchanged) or outside (and should encode stuff).
|
// pass things through unchanged) or outside (and should encode stuff).
|
||||||
|
@ -319,6 +358,7 @@ public final class HtmlEncoder {
|
||||||
// if we are inside a <code> tag, we encode everything to make
|
// if we are inside a <code> tag, we encode everything to make
|
||||||
// documentation work easier
|
// documentation work easier
|
||||||
boolean insideCodeTag = false;
|
boolean insideCodeTag = false;
|
||||||
|
boolean insidePreTag = false;
|
||||||
// are we within a Helma <% macro %> tag? We treat macro tags and
|
// are we within a Helma <% macro %> tag? We treat macro tags and
|
||||||
// comments specially, since we can't rely on tag balancing
|
// comments specially, since we can't rely on tag balancing
|
||||||
// to know when we leave a macro tag or comment.
|
// to know when we leave a macro tag or comment.
|
||||||
|
@ -328,17 +368,104 @@ public final class HtmlEncoder {
|
||||||
// the quotation mark we are in within an HTML or Macro tag, if any
|
// the quotation mark we are in within an HTML or Macro tag, if any
|
||||||
char htmlQuoteChar = '\u0000';
|
char htmlQuoteChar = '\u0000';
|
||||||
char macroQuoteChar = '\u0000';
|
char macroQuoteChar = '\u0000';
|
||||||
// the difference between swallowOneNewline and ignoreNewline is that
|
// number of newlines to ignore in \n -> <br> conversion
|
||||||
// swallowOneNewline is just effective once (for the next newline)
|
int swallowLinebreaks = 0;
|
||||||
boolean ignoreNewline = false;
|
// number of newlines met since the last non-whitespace character
|
||||||
boolean swallowOneNewline = false;
|
int linebreaks = 0;
|
||||||
// did we meet a backslash escape?
|
// did we meet a backslash escape?
|
||||||
boolean escape = false;
|
boolean escape = false;
|
||||||
|
|
||||||
for (int i=0; i<l; i++) {
|
for (int i=0; i<l; i++) {
|
||||||
char c = str.charAt(i);
|
char c = str.charAt(i);
|
||||||
|
|
||||||
|
// step one: check if this is the beginning of an HTML tag, comment or
|
||||||
|
// Helma macro.
|
||||||
|
if (c == '<') {
|
||||||
|
if (i < l-2) {
|
||||||
|
if (!insideMacroTag && '%' == str.charAt(i+1)) {
|
||||||
|
// this is the beginning of a Helma macro tag
|
||||||
|
if (!insideCodeTag) {
|
||||||
|
insideMacroTag = insideTag = true;
|
||||||
|
macroQuoteChar = '\u0000';
|
||||||
|
}
|
||||||
|
} else if ('!' == str.charAt(i+1) && '-' == str.charAt(i+2)) {
|
||||||
|
// the beginning of an HTML comment?
|
||||||
|
if (!insideCodeTag)
|
||||||
|
insideComment = insideTag = (i<l-3 && '-' == str.charAt(i+3));
|
||||||
|
} else if (!insideTag) {
|
||||||
|
// check if this is a HTML tag.
|
||||||
|
boolean insideCloseTag = ('/' == str.charAt(i+1));
|
||||||
|
int tagStart = insideCloseTag ? i+2 : i+1;
|
||||||
|
int j = tagStart;
|
||||||
|
while (j<l && Character.isLetterOrDigit (str.charAt(j)))
|
||||||
|
j++;
|
||||||
|
if (j > tagStart && j < l) {
|
||||||
|
String tagName = str.substring (tagStart, j).toLowerCase();
|
||||||
|
if ("code".equals (tagName) && insideCloseTag && insideCodeTag)
|
||||||
|
insideCodeTag = false;
|
||||||
|
if ((allowedTags == null || allowedTags.contains (tagName)) &&
|
||||||
|
allTags.contains (tagName) && !insideCodeTag) {
|
||||||
|
insideHtmlTag = insideTag = true;
|
||||||
|
htmlQuoteChar = '\u0000';
|
||||||
|
// set ignoreNewline on some tags, depending on whether they're
|
||||||
|
// being opened or closed.
|
||||||
|
// what's going on here? we switch newline encoding on inside some tags, for
|
||||||
|
// others we switch it on when they're closed
|
||||||
|
linebreaks = Math.max(linebreaks-swallowLinebreaks, 0);
|
||||||
|
if (swallowAll.contains (tagName)) {
|
||||||
|
swallowLinebreaks = 1000;
|
||||||
|
} else if (swallowTwo.contains (tagName)) {
|
||||||
|
swallowLinebreaks = 2;
|
||||||
|
} else if (swallowOne.contains (tagName)) {
|
||||||
|
swallowLinebreaks = 1;
|
||||||
|
} else {
|
||||||
|
swallowLinebreaks = 0;
|
||||||
|
}
|
||||||
|
if (insideCloseTag) {
|
||||||
|
int t = openTags.search (tagName);
|
||||||
|
if (t == -1) {
|
||||||
|
i = j;
|
||||||
|
insideHtmlTag = insideTag = false;
|
||||||
|
continue;
|
||||||
|
} else if (t > 1) {
|
||||||
|
for (int k=1; k<t; k++) {
|
||||||
|
ret.append ("</");
|
||||||
|
ret.append (openTags.pop());
|
||||||
|
ret.append (">");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
openTags.pop ();
|
||||||
|
} else {
|
||||||
|
openTags.push (tagName);
|
||||||
|
swallowLinebreaks = Math.max (swallowLinebreaks-1, 0);
|
||||||
|
}
|
||||||
|
if ("code".equals (tagName) && !insideCloseTag)
|
||||||
|
insideCodeTag = true;
|
||||||
|
if ("pre".equals (tagName))
|
||||||
|
insidePreTag = !insideCloseTag;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // if (i < l-2)
|
||||||
|
}
|
||||||
|
if (linebreaks > 0 && !Character.isWhitespace(c)) {
|
||||||
|
if (!insidePreTag && linebreaks > swallowLinebreaks) {
|
||||||
|
linebreaks -= swallowLinebreaks;
|
||||||
|
for (int k=0; k<linebreaks; k++)
|
||||||
|
ret.append ("\n<br />");
|
||||||
|
}
|
||||||
|
if (!insideTag)
|
||||||
|
swallowLinebreaks = 0;
|
||||||
|
linebreaks = 0;
|
||||||
|
}
|
||||||
|
|
||||||
switch (c) {
|
switch (c) {
|
||||||
|
case '<':
|
||||||
|
if (insideTag)
|
||||||
|
ret.append ('<');
|
||||||
|
else
|
||||||
|
ret.append ("<");
|
||||||
|
break;
|
||||||
case '&':
|
case '&':
|
||||||
// check if this is an HTML entity already,
|
// check if this is an HTML entity already,
|
||||||
// in which case we pass it through unchanged
|
// in which case we pass it through unchanged
|
||||||
|
@ -365,56 +492,6 @@ public final class HtmlEncoder {
|
||||||
// we didn't reach a break, so encode the ampersand as HTML entity
|
// we didn't reach a break, so encode the ampersand as HTML entity
|
||||||
ret.append ("&");
|
ret.append ("&");
|
||||||
break;
|
break;
|
||||||
case '<':
|
|
||||||
if (i < l-2) {
|
|
||||||
if (!insideMacroTag && '%' == str.charAt(i+1)) {
|
|
||||||
// this is the beginning of a Helma macro tag
|
|
||||||
if (!insideCodeTag) {
|
|
||||||
insideMacroTag = insideTag = true;
|
|
||||||
macroQuoteChar = '\u0000';
|
|
||||||
}
|
|
||||||
} else if ('!' == str.charAt(i+1) && '-' == str.charAt(i+2)) {
|
|
||||||
// the beginning of an HTML comment?
|
|
||||||
if (!insideCodeTag)
|
|
||||||
insideComment = insideTag = (i<l-3 && '-' == str.charAt(i+3));
|
|
||||||
} else if (!insideTag) {
|
|
||||||
// check if this is a HTML tag.
|
|
||||||
boolean insideCloseTag = ('/' == str.charAt(i+1));
|
|
||||||
int tagStart = insideCloseTag ? i+2 : i+1;
|
|
||||||
int j = tagStart;
|
|
||||||
while (j<l && Character.isLetterOrDigit (str.charAt(j)))
|
|
||||||
j++;
|
|
||||||
if (j > tagStart && j < l) {
|
|
||||||
String tagName = str.substring (tagStart, j).toLowerCase();
|
|
||||||
if ("code".equals (tagName) && insideCloseTag && insideCodeTag)
|
|
||||||
insideCodeTag = false;
|
|
||||||
if (allTags.contains (tagName) && !insideCodeTag) {
|
|
||||||
insideHtmlTag = insideTag = true;
|
|
||||||
htmlQuoteChar = '\u0000';
|
|
||||||
// set ignoreNewline on some tags, depending on wheather they're
|
|
||||||
// being opened or closed.
|
|
||||||
// what's going on here? we switch newline encoding on inside some tags, for
|
|
||||||
// others we switch it on when they're closed
|
|
||||||
if (encodeLinebreakTags.contains (tagName)) {
|
|
||||||
ignoreNewline = insideCloseTag;
|
|
||||||
swallowOneNewline = true;
|
|
||||||
} else if (suppressLinebreakTags.contains (tagName)) {
|
|
||||||
ignoreNewline = !insideCloseTag;
|
|
||||||
swallowOneNewline = true;
|
|
||||||
} else if ("p".equals (tagName) || "blockquote".equals (tagName) || "bq".equals (tagName)) {
|
|
||||||
swallowOneNewline = true;
|
|
||||||
}
|
|
||||||
if ("code".equals (tagName) && !insideCloseTag)
|
|
||||||
insideCodeTag = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // if (i < l-2)
|
|
||||||
if (insideTag)
|
|
||||||
ret.append ('<');
|
|
||||||
else
|
|
||||||
ret.append ("<");
|
|
||||||
break;
|
|
||||||
case '\\':
|
case '\\':
|
||||||
ret.append (c);
|
ret.append (c);
|
||||||
if (insideTag && !insideComment)
|
if (insideTag && !insideComment)
|
||||||
|
@ -444,11 +521,8 @@ public final class HtmlEncoder {
|
||||||
break;
|
break;
|
||||||
case '\n':
|
case '\n':
|
||||||
ret.append ('\n');
|
ret.append ('\n');
|
||||||
if (!insideTag) {
|
if (!insideTag)
|
||||||
if (!ignoreNewline && !swallowOneNewline)
|
linebreaks++;
|
||||||
ret.append ("<br />");
|
|
||||||
swallowOneNewline = false;
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
case '>':
|
case '>':
|
||||||
// For Helma macro tags and comments, we overrule tag balancing,
|
// For Helma macro tags and comments, we overrule tag balancing,
|
||||||
|
@ -465,6 +539,11 @@ public final class HtmlEncoder {
|
||||||
// only leave HTML tag if quotation marks are balanced
|
// only leave HTML tag if quotation marks are balanced
|
||||||
// within that tag.
|
// within that tag.
|
||||||
insideHtmlTag = htmlQuoteChar != '\u0000';
|
insideHtmlTag = htmlQuoteChar != '\u0000';
|
||||||
|
// Check if this is an empty tag so we don't generate an
|
||||||
|
// additional </close> tag.
|
||||||
|
if (str.charAt(i-1) == '/') {
|
||||||
|
openTags.pop();
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
ret.append (">");
|
ret.append (">");
|
||||||
}
|
}
|
||||||
|
@ -482,11 +561,18 @@ public final class HtmlEncoder {
|
||||||
ret.append ((int) c);
|
ret.append ((int) c);
|
||||||
ret.append (";");
|
ret.append (";");
|
||||||
}
|
}
|
||||||
if (swallowOneNewline && !insideTag && !Character.isWhitespace (c))
|
|
||||||
swallowOneNewline = false;
|
|
||||||
escape = false;
|
escape = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// if tags were opened but not closed, close them.
|
||||||
|
int o = openTags.size();
|
||||||
|
if (o > 0) {
|
||||||
|
for (int k=0; k<o; k++) {
|
||||||
|
ret.append ("</");
|
||||||
|
ret.append (openTags.pop());
|
||||||
|
ret.append (">");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue