Implemented pretty good heuristics to close tags when they ought to be closed.
This commit is contained in:
hns 2002-06-04 16:14:56 +00:00
parent 15d7342ea4
commit 850c07de80

View file

@ -6,6 +6,9 @@ package helma.util;
import java.util.HashMap; import java.util.HashMap;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashSet;
import java.util.Stack;
import java.util.EmptyStackException;
import java.io.IOException; import java.io.IOException;
import javax.swing.text.html.parser.*; import javax.swing.text.html.parser.*;
import javax.swing.text.SimpleAttributeSet; import javax.swing.text.SimpleAttributeSet;
@ -18,8 +21,26 @@ public class HtmlParser extends Parser {
HTMLBuilder builder; HTMLBuilder builder;
Attributes attributes = new Attributes (); Attributes attributes = new Attributes ();
// Names of the currently open elements, innermost on top. handleStartTag
// pushes the upper-cased tag name; handleEndTag and closeOpenTags pop.
Stack stack = new Stack ();
// Stopper sets handed to closeOpenTags: while searching the stack for an
// open tag to close, the search is abandoned as soon as an element named
// in the set is met.
// No stoppers at all (used for OPTION and P).
static final HashSet stopNone = new HashSet ();
// Used for TD and TR: never look beyond the enclosing TABLE.
static final HashSet stopTable = new HashSet ();
// Used for LI: never look beyond the enclosing TABLE, UL or OL.
static final HashSet stopList = new HashSet ();
// Used for DT and DD: never look beyond the enclosing TABLE or DL.
static final HashSet stopDeflist = new HashSet ();
static {
stopTable.add ("TABLE");
stopList.add ("TABLE");
stopList.add ("UL");
stopList.add ("OL");
stopDeflist.add ("TABLE");
stopDeflist.add ("DL");
}
public HtmlParser () throws IOException { public HtmlParser () throws IOException {
super (DTD.getDTD ("html32")); super (DTD.getDTD ("html32"));
// define elements to be treated as container tags, and undefine those
// to be treated as empty tags.
dtd.getElement ("table"); dtd.getElement ("table");
dtd.getElement ("tr"); dtd.getElement ("tr");
dtd.getElement ("td"); dtd.getElement ("td");
@ -34,6 +55,17 @@ public class HtmlParser extends Parser {
dtd.getElement ("ul"); dtd.getElement ("ul");
dtd.getElement ("ol"); dtd.getElement ("ol");
dtd.getElement ("li"); dtd.getElement ("li");
dtd.getElement ("dl");
dtd.getElement ("dt");
dtd.getElement ("dd");
dtd.getElement ("h1");
dtd.getElement ("h2");
dtd.getElement ("h3");
dtd.getElement ("h4");
dtd.getElement ("h5");
dtd.getElement ("h6");
dtd.getElement ("form");
dtd.getElement ("option");
dtd.elementHash.remove ("meta"); dtd.elementHash.remove ("meta");
dtd.elementHash.remove ("link"); dtd.elementHash.remove ("link");
dtd.elementHash.remove ("base"); dtd.elementHash.remove ("base");
@ -52,40 +84,70 @@ public class HtmlParser extends Parser {
// System.err.println ("handleStartTag ("+tag.getHTMLTag()+")"); // System.err.println ("handleStartTag ("+tag.getHTMLTag()+")");
attributes.convert (getAttributes()); attributes.convert (getAttributes());
flushAttributes(); flushAttributes();
String tagname = tag.getHTMLTag().toString().toUpperCase();
// immediately empty A anchor tag
if ("A".equals (tagname) && attributes.getValue ("href") == null) try {
builder.startElement (tagname, attributes);
builder.endElement (tagname);
return;
} catch (SAXException x) {}
if ("TD".equals (tagname)) {
closeOpenTags ("TD", stopTable, 10);
} else if ("TR".equals (tagname)) {
closeOpenTags ("TR", stopTable, 10);
} else if ("LI".equals (tagname)) {
closeOpenTags ("LI", stopList, 6);
} else if ("DT".equals (tagname) || "DD".equals (tagname)) {
closeOpenTags ("DT", stopDeflist, 6);
closeOpenTags ("DL", stopDeflist, 6);
} else if ("OPTION".equals (tagname)) {
closeOpenTags ("OPTION", stopNone, 1);
} else if ("P".equals (tagname)) {
closeOpenTags ("P", stopNone, 1);
}
stack.push (tagname);
try { try {
builder.startElement (tag.getHTMLTag().toString().toUpperCase(), attributes); builder.startElement (tagname, attributes);
} catch (SAXException x) { } catch (SAXException x) {
System.err.println ("Error in handleStartTag"); System.err.println ("Error in handleStartTag");
} }
} }
/**
 * Handle Empty Tag.
 * Converts the current attributes and reports the element to the builder
 * as a start event immediately followed by the matching end event.
 *
 * @param tag the empty tag encountered by the parser
 */
protected void handleEmptyTag(TagElement tag) {
    // System.err.println ("handleEmptyTag ("+tag.getHTMLTag()+")");
    attributes.convert (getAttributes());
    flushAttributes();
    // Upper-case with a fixed locale. The default-locale toUpperCase() maps
    // "i" to a dotted capital I under e.g. a Turkish locale, which would
    // turn "img" into a name other than "IMG".
    String name = tag.getHTMLTag().toString().toUpperCase (java.util.Locale.ENGLISH);
    try {
        builder.startElement (name, attributes);
        builder.endElement (name);
    } catch (SAXException x) {
        System.err.println ("Error in handleEmptyTag: "+x);
    }
}
/** /**
* Handle End Tag. * Handle End Tag.
*/ */
protected void handleEndTag(TagElement tag) { protected void handleEndTag(TagElement tag) {
// System.err.println ("handleEndTag ("+tag.getHTMLTag()+")"); // System.err.println ("handleEndTag ("+tag.getHTMLTag()+")");
String tagname = tag.getHTMLTag().toString().toUpperCase();
try { try {
builder.endElement (tag.getHTMLTag().toString().toUpperCase()); if (tagname.equals (stack.peek ()))
stack.pop ();
} catch (EmptyStackException es) {}
try {
builder.endElement (tagname);
} catch (SAXException x) { } catch (SAXException x) {
System.err.println ("Error in handleEndTag: "+x); System.err.println ("Error in handleEndTag: "+x);
} }
} }
/**
 * Handle Empty Tag.
 * Converts the current attributes and reports the element to the builder
 * as a start event immediately followed by the matching end event. The
 * open-tag stack is not involved.
 *
 * @param tag the empty tag encountered by the parser
 */
protected void handleEmptyTag(TagElement tag) {
    // System.err.println ("handleEmptyTag ("+tag.getHTMLTag()+")");
    attributes.convert (getAttributes());
    flushAttributes();
    // Use a fixed locale for the case conversion: with the default locale a
    // Turkish (or similar) environment upper-cases "i" to a dotted capital I,
    // so "img" would no longer come out as "IMG".
    String tagname = tag.getHTMLTag().toString().toUpperCase (java.util.Locale.ENGLISH);
    try {
        builder.startElement (tagname, attributes);
        builder.endElement (tagname);
    } catch (SAXException x) {
        System.err.println ("Error in handleEmptyTag: "+x);
    }
}
/** /**
* Handle Text. * Handle Text.
*/ */
@ -108,7 +170,13 @@ public class HtmlParser extends Parser {
/** /**
* Handle comment. * Handle comment.
*/ */
protected void handleComment(char text[]) { protected void handleComment(char data[]) {
// System.err.println ("handleComment ("+new String (data)+")");
/* try {
builder.characters (data, 0, data.length);
} catch (SAXException x) {
System.err.println ("Error in handleComment");
}*/
} }
public HTMLDocument getDocument () { public HTMLDocument getDocument () {
@ -118,6 +186,29 @@ public class HtmlParser extends Parser {
return builder.getHTMLDocument (); return builder.getHTMLDocument ();
} }
/**
 * Close open elements down to and including the innermost open element
 * named <code>until</code>, reporting an end event to the builder for each.
 * The stack is searched from the top, looking at no more than
 * <code>maxdepth</code> entries. Nothing is closed if a stopper element is
 * met before <code>until</code>, or if <code>until</code> is not found
 * within that range.
 *
 * @param until    upper-case name of the element to close
 * @param stoppers upper-case names of elements the search must not cross
 * @param maxdepth maximum number of stack entries to inspect
 */
private void closeOpenTags (String until, HashSet stoppers, int maxdepth) {
    int size = stack.size();
    int lowest = Math.max (0, size - maxdepth);
    int found = -1;
    for (int i = size - 1; i >= lowest; i--) {
        Object open = stack.elementAt (i);
        if (stoppers.contains (open))
            return;
        if (until.equals (open)) {
            found = i;
            break;
        }
    }
    if (found < 0)
        return;
    // Pop everything above the match and the match itself, innermost first.
    for (int i = size - 1; i >= found; i--) {
        try {
            String name = (String) stack.pop ();
            builder.endElement (name);
        } catch (Exception x) {
            // Keep closing the remaining elements, but report the failure
            // like the other handlers do instead of hiding it.
            System.err.println ("Error in closeOpenTags: "+x);
        }
    }
}
class Attributes implements org.xml.sax.AttributeList { class Attributes implements org.xml.sax.AttributeList {
HashMap map = new HashMap(); HashMap map = new HashMap();
@ -155,8 +246,9 @@ public class HtmlParser extends Parser {
for (Enumeration e = attset.getAttributeNames(); e.hasMoreElements(); ) { for (Enumeration e = attset.getAttributeNames(); e.hasMoreElements(); ) {
Object name = e.nextElement (); Object name = e.nextElement ();
Object value = attset.getAttribute (name).toString(); Object value = attset.getAttribute (name).toString();
map.put (name.toString(), value); name = name.toString().toLowerCase ();
names.add (name.toString()); map.put (name, value);
names.add (name);
values.add (value); values.add (value);
} }
} }