diff --git a/src/helma/util/HtmlParser.java b/src/helma/util/HtmlParser.java
deleted file mode 100644
index eeb58e13..00000000
--- a/src/helma/util/HtmlParser.java
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Helma License Notice
- *
- * The contents of this file are subject to the Helma License
- * Version 2.0 (the "License"). You may not use this file except in
- * compliance with the License. A copy of the License is available at
- * http://adele.helma.org/download/helma/license.txt
- *
- * Copyright 1998-2003 Helma Software. All Rights Reserved.
- *
- * $RCSfile$
- * $Author$
- * $Revision$
- * $Date$
- */
-
-package helma.util;
-
-import org.apache.html.dom.*;
-import org.w3c.dom.html.HTMLDocument;
-import org.xml.sax.SAXException;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.EmptyStackException;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Stack;
-import javax.swing.text.SimpleAttributeSet;
-import javax.swing.text.html.parser.*;
-
-/**
- *
- */
-public class HtmlParser extends Parser {
- static final HashSet stopNone = new HashSet();
- static final HashSet stopTable = new HashSet();
- static final HashSet stopList = new HashSet();
- static final HashSet stopDeflist = new HashSet();
-
- static {
- stopTable.add("TABLE");
- stopList.add("TABLE");
- stopList.add("UL");
- stopList.add("OL");
- stopDeflist.add("TABLE");
- stopDeflist.add("DL");
- }
-
- HTMLBuilder builder;
- Attributes attributes = new Attributes();
- Stack stack = new Stack();
-
- /**
- * Creates a new HtmlParser object.
- *
- * @throws IOException ...
- */
- public HtmlParser() throws IOException {
- super(DTD.getDTD("html32"));
-
- // define elements to be treated as container tags, and undefine those
- // to be treated as empty tags.
- dtd.getElement("table");
- dtd.getElement("tr");
- dtd.getElement("td");
- dtd.getElement("span");
- dtd.getElement("div");
- dtd.getElement("font");
- dtd.getElement("b");
- dtd.getElement("i");
- dtd.getElement("a");
- dtd.getElement("blockquote");
- dtd.getElement("em");
- dtd.getElement("ul");
- dtd.getElement("ol");
- dtd.getElement("li");
- dtd.getElement("dl");
- dtd.getElement("dt");
- dtd.getElement("dd");
- dtd.getElement("h1");
- dtd.getElement("h2");
- dtd.getElement("h3");
- dtd.getElement("h4");
- dtd.getElement("h5");
- dtd.getElement("h6");
- dtd.getElement("form");
- dtd.getElement("option");
- dtd.elementHash.remove("meta");
- dtd.elementHash.remove("link");
- dtd.elementHash.remove("base");
- builder = new HTMLBuilder();
-
- try {
- builder.startDocument();
- } catch (SAXException x) {
- System.err.println("Error in constructor");
- }
- }
-
- /**
- * Handle Start Tag.
- */
- protected void handleStartTag(TagElement tag) {
- // System.err.println ("handleStartTag ("+tag.getHTMLTag()+")");
- attributes.convert(getAttributes());
- flushAttributes();
-
- String tagname = tag.getHTMLTag().toString().toUpperCase();
-
- // immediately empty A anchor tag
- if ("A".equals(tagname) && (attributes.getValue("href") == null)) {
- try {
- builder.startElement(tagname, attributes);
- builder.endElement(tagname);
-
- return;
- } catch (SAXException x) {
- }
- }
-
- if ("TD".equals(tagname)) {
- closeOpenTags("TD", stopTable, 10);
- } else if ("TR".equals(tagname)) {
- closeOpenTags("TR", stopTable, 10);
- } else if ("LI".equals(tagname)) {
- closeOpenTags("LI", stopList, 6);
- } else if ("DT".equals(tagname) || "DD".equals(tagname)) {
- closeOpenTags("DT", stopDeflist, 6);
- closeOpenTags("DL", stopDeflist, 6);
- } else if ("OPTION".equals(tagname)) {
- closeOpenTags("OPTION", stopNone, 1);
- } else if ("P".equals(tagname)) {
- closeOpenTags("P", stopNone, 1);
- }
-
- stack.push(tagname);
-
- try {
- builder.startElement(tagname, attributes);
- } catch (SAXException x) {
- System.err.println("Error in handleStartTag");
- }
- }
-
- /**
- * Handle End Tag.
- */
- protected void handleEndTag(TagElement tag) {
- // System.err.println ("handleEndTag ("+tag.getHTMLTag()+")");
- String tagname = tag.getHTMLTag().toString().toUpperCase();
-
- try {
- if (tagname.equals(stack.peek())) {
- stack.pop();
- }
- } catch (EmptyStackException es) {
- }
-
- try {
- builder.endElement(tagname);
- } catch (SAXException x) {
- System.err.println("Error in handleEndTag: " + x);
- }
- }
-
- /**
- * Handle Empty Tag.
- */
- protected void handleEmptyTag(TagElement tag) {
- // System.err.println ("handleEmptyTag ("+tag.getHTMLTag()+")");
- attributes.convert(getAttributes());
- flushAttributes();
-
- String tagname = tag.getHTMLTag().toString().toUpperCase();
-
- try {
- builder.startElement(tagname, attributes);
- builder.endElement(tagname);
- } catch (SAXException x) {
- System.err.println("Error in handleEmptyTag: " + x);
- }
- }
-
- /**
- * Handle Text.
- */
- protected void handleText(char[] data) {
- // System.err.println ("handleText ("+new String (data)+")");
- try {
- builder.characters(data, 0, data.length);
- } catch (SAXException x) {
- System.err.println("Error in handleText");
- }
- }
-
- /*
- * Error handling.
- */
- protected void handleError(int ln, String errorMsg) {
- // System.err.println ("handleError ("+ln+": "+errorMsg+")");
- }
-
- /**
- * Handle comment.
- */
- protected void handleComment(char[] data) {
- // System.err.println ("handleComment ("+new String (data)+")");
-
- /* try {
- builder.characters (data, 0, data.length);
- } catch (SAXException x) {
- System.err.println ("Error in handleComment");
- }*/
- }
-
- /**
- *
- *
- * @return ...
- */
- public HTMLDocument getDocument() {
- try {
- builder.endDocument();
- } catch (SAXException x) {
- }
-
- return builder.getHTMLDocument();
- }
-
- private void closeOpenTags(String until, HashSet stoppers, int maxdepth) {
- int l = stack.size();
- int stop = Math.max(0, l - maxdepth);
- int found = -1;
-
- for (int i = l - 1; i >= stop; i--) {
- Object o = stack.elementAt(i);
-
- if (stoppers.contains(o)) {
- return;
- }
-
- if (until.equals(o)) {
- found = i;
-
- break;
- }
- }
-
- if (found > -1) {
- for (int i = l - 1; i >= found; i--) {
- try {
- String t = (String) stack.pop();
-
- builder.endElement(t);
- } catch (Exception x) {
- }
- }
- }
- }
-
- class Attributes implements org.xml.sax.AttributeList {
- HashMap map = new HashMap();
- ArrayList names = new ArrayList();
- ArrayList values = new ArrayList();
-
- public int getLength() {
- return names.size();
- }
-
- public String getName(int i) {
- return (String) names.get(i);
- }
-
- public String getType(int i) {
- return "CDATA";
- }
-
- public String getType(String name) {
- return "CDATA";
- }
-
- public String getValue(int i) {
- return (String) values.get(i);
- }
-
- public String getValue(String name) {
- return (String) map.get(name);
- }
-
- public void convert(SimpleAttributeSet attset) {
- map.clear();
- names.clear();
- values.clear();
-
- for (Enumeration e = attset.getAttributeNames(); e.hasMoreElements();) {
- Object name = e.nextElement();
- Object value = attset.getAttribute(name).toString();
-
- name = name.toString().toLowerCase();
- map.put(name, value);
- names.add(name);
- values.add(value);
- }
- }
- }
-}
diff --git a/src/helma/util/XmlUtils.java b/src/helma/util/XmlUtils.java
index 055d5dfa..da85e62e 100644
--- a/src/helma/util/XmlUtils.java
+++ b/src/helma/util/XmlUtils.java
@@ -17,8 +17,13 @@
package helma.util;
import org.w3c.dom.Document;
+import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.XMLReaderAdapter;
+import org.ccil.cowan.tagsoup.Parser;
+import org.apache.html.dom.HTMLBuilder;
+
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
@@ -55,7 +60,7 @@ public class XmlUtils {
}
DocumentBuilder parser = domBuilderFactory.newDocumentBuilder();
- Document doc = null;
+ Document doc;
if (obj instanceof String) {
try {
@@ -71,10 +76,11 @@ public class XmlUtils {
doc = parser.parse(new InputSource((InputStream) obj));
} else if (obj instanceof Reader) {
doc = parser.parse(new InputSource((Reader) obj));
+ } else {
+ throw new RuntimeException("Unrecognized argument to parseXml: " + obj);
}
doc.normalize();
-
return doc;
}
@@ -87,36 +93,39 @@ public class XmlUtils {
*
* @throws IOException ...
*/
- public static Document parseHtml(Object obj)
- throws IOException {
+ public static HTMLDocument parseHtml(Object obj)
+ throws IOException, SAXException {
try {
- Class.forName("org.apache.html.dom.HTMLBuilder");
+ Class.forName("org.apache.html.dom.HTMLDocumentImpl");
} catch (Throwable notfound) {
- throw new IOException("Couldn't load nekohtml/Xerces HTML parser: " +
- notfound);
+ throw new RuntimeException("Couldn't load Xerces HTML DOM classes. " +
+ "Make sure you have xercesImpl.jar and xml-apis.jar in your classpath.");
}
- Document doc = null;
- HtmlParser parser = new HtmlParser();
-
if (obj instanceof String) {
try {
// first try to interpret string as URL
URL url = new URL(obj.toString());
-
- parser.parse(new InputStreamReader(url.openStream()));
+ return getHtmlDocument(new InputStreamReader(url.openStream()));
} catch (MalformedURLException nourl) {
// if not a URL, maybe it is the XML itself
- parser.parse(new StringReader(obj.toString()));
+ return getHtmlDocument(new StringReader(obj.toString()));
}
} else if (obj instanceof InputStream) {
- parser.parse(new InputStreamReader((InputStream) obj));
+ return getHtmlDocument(new InputStreamReader((InputStream) obj));
} else if (obj instanceof Reader) {
- parser.parse((Reader) obj);
+ return getHtmlDocument((Reader) obj);
+ } else {
+ throw new RuntimeException("Unrecognized argument to parseHtml: " + obj);
}
+ }
- doc = parser.getDocument();
-
- return doc;
+ private static HTMLDocument getHtmlDocument(Reader reader)
+ throws IOException, SAXException {
+ XMLReaderAdapter parser = new XMLReaderAdapter(new Parser());
+ HTMLBuilder builder = new HTMLBuilder();
+ parser.setDocumentHandler(builder);
+ parser.parse(new InputSource(reader));
+ return builder.getHTMLDocument();
}
}