Switch to standard Java (JAXP) XML parser for getXmlDocument()

Also provide our own HTML parser for getHtmlDocument(), a thin bridge between javax.swing.text.html.parser.* and the Apache HTML-DOM implementation.
2002-06-03 20:15:16 +00:00 · 2002-06-03 20:15:16 +00:00 · 15d7342ea4
commit 15d7342ea4
parent a699facf6d
3 changed files with 251 additions and 35 deletions
--- a/src/helma/scripting/fesi/HopExtension.java
+++ b/src/helma/scripting/fesi/HopExtension.java
@ -654,24 +654,9 @@ public class HopExtension {
        }
        public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
            try {
-                // Class.forName ("org.apache.xerces.parsers.DOMParser");
-                // org.apache.xerces.parsers.DOMParser parser = new org.apache.xerces.parsers.DOMParser();
-                Class.forName ("org.openxml.parser.XMLParser");
-                org.openxml.parser.XMLParser parser = new org.openxml.parser.XMLParser();
                Object p = arguments[0].toJavaObject ();
-                if (p instanceof String) try {
-                   // first try to interpret string as URL
-                   java.net.URL u = new java.net.URL (p.toString ());
-                   parser.parse (p.toString());
-                } catch (java.net.MalformedURLException nourl) {
-                   // if not a URL, maybe it is the XML itself
-                   parser.parse (new InputSource (new StringReader (p.toString())));
-                }
-                else if (p instanceof InputStream)
-                   parser.parse (new InputSource ((InputStream) p));
-                else if (p instanceof Reader)
-                   parser.parse (new InputSource ((Reader) p));
-                return ESLoader.normalizeObject (parser.getDocument(), evaluator);
+                Object doc = helma.util.XmlUtils.parseXml (p);
+                return ESLoader.normalizeObject (doc, evaluator);
            } catch (Exception noluck) {
                app.logEvent ("Error creating XML document: "+noluck);
            }
@ -685,22 +670,9 @@ public class HopExtension {
        }
        public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
            try {
-                Class.forName ("org.openxml.parser.HTMLParser");
-                org.openxml.parser.HTMLParser parser = new org.openxml.parser.HTMLParser();
                Object p = arguments[0].toJavaObject ();
-                if (p instanceof String) try {
-                   // first try to interpret string as URL
-                   java.net.URL u = new java.net.URL (p.toString ());
-                   parser.parse (p.toString());
-                } catch (java.net.MalformedURLException nourl) {
-                   // if not a URL, maybe it is the HTML itself
-                   parser.parse (new InputSource (new StringReader (p.toString())));
-                }
-                else if (p instanceof InputStream)
-                   parser.parse (new InputSource ((InputStream) p));
-                else if (p instanceof Reader)
-                   parser.parse (new InputSource ((Reader) p));
-                return ESLoader.normalizeObject (parser.getDocument(), evaluator);
+                Object doc = helma.util.XmlUtils.parseHtml (p);
+                return ESLoader.normalizeObject (doc, evaluator);
            } catch (Exception noluck) {
                app.logEvent ("Error creating HTML document: "+noluck);
            }
@ -714,13 +686,13 @@ public class HopExtension {
        }
        public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
            try {
-	   Class.forName ("org.w3c.dom.Document");
+                Class.forName ("org.w3c.dom.Document");
                org.w3c.dom.Document doc = (org.w3c.dom.Document) arguments[0].toJavaObject ();
                Class.forName ("org.jdom.input.DOMBuilder");
-	   org.jdom.input.DOMBuilder builder = new org.jdom.input.DOMBuilder ();
+                org.jdom.input.DOMBuilder builder = new org.jdom.input.DOMBuilder ();
                return ESLoader.normalizeObject (builder.build (doc), evaluator);
            } catch (Exception noluck) {
-                app.logEvent ("Error wrapping JDOM document: "+noluck);
+                app.logEvent ("Error building JDOM document: "+noluck);
            }
            return ESNull.theNull;
        }
--- a/src/helma/util/HtmlParser.java
+++ b/src/helma/util/HtmlParser.java
@ -0,0 +1,166 @@
+// HtmlParser.java
+// Copyright (c) Hannes Wallnöfer 2002
+
+package helma.util;
+
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.io.IOException;
+import javax.swing.text.html.parser.*;
+import javax.swing.text.SimpleAttributeSet;
+import org.xml.sax.SAXException;
+import org.apache.html.dom.*;
+import org.w3c.dom.html.HTMLDocument;
+
+public class HtmlParser extends Parser {
+
+    HTMLBuilder builder;
+    Attributes attributes = new Attributes ();
+
+    public HtmlParser () throws IOException {
+	super (DTD.getDTD ("html32"));
+	dtd.getElement ("table");
+	dtd.getElement ("tr");
+	dtd.getElement ("td");
+	dtd.getElement ("span");
+	dtd.getElement ("div");
+	dtd.getElement ("font");
+	dtd.getElement ("b");
+	dtd.getElement ("i");
+	dtd.getElement ("a");
+	dtd.getElement ("blockquote");
+	dtd.getElement ("em");
+	dtd.getElement ("ul");
+	dtd.getElement ("ol");
+	dtd.getElement ("li");
+	dtd.elementHash.remove ("meta");
+	dtd.elementHash.remove ("link");
+	dtd.elementHash.remove ("base");
+	builder = new HTMLBuilder ();
+	try {
+	    builder.startDocument ();
+	} catch (SAXException x) {
+	    System.err.println ("Error in constructor");
+	}
+    }
+
+    /**
+     * Handle Start Tag.
+     */
+    protected void handleStartTag(TagElement tag) {
+	// System.err.println ("handleStartTag ("+tag.getHTMLTag()+")");
+	attributes.convert (getAttributes());
+	flushAttributes();
+	try {
+	    builder.startElement (tag.getHTMLTag().toString().toUpperCase(), attributes);
+	} catch (SAXException x) {
+	    System.err.println ("Error in handleStartTag");
+	}
+    }
+
+    /**
+     * Handle Empty Tag.
+     */
+    protected void handleEmptyTag(TagElement tag) {
+	// System.err.println ("handleEmptyTag ("+tag.getHTMLTag()+")");
+	attributes.convert (getAttributes());
+	flushAttributes();
+	try {
+	    builder.startElement (tag.getHTMLTag().toString().toUpperCase(), attributes);
+	    builder.endElement (tag.getHTMLTag().toString().toUpperCase());
+	} catch (SAXException x) {
+	    System.err.println ("Error in handleEmptyTag: "+x);
+	}
+    }
+
+    /**
+     * Handle End Tag.
+     */
+    protected void handleEndTag(TagElement tag) {
+	// System.err.println ("handleEndTag ("+tag.getHTMLTag()+")");
+	try {
+	    builder.endElement (tag.getHTMLTag().toString().toUpperCase());
+	} catch (SAXException x) {
+	    System.err.println ("Error in handleEndTag: "+x);
+	}
+    }
+
+    /**
+     * Handle Text.
+     */
+    protected void handleText(char data[]) {
+	// System.err.println ("handleText ("+new String (data)+")");
+	try {
+	    builder.characters (data, 0, data.length);
+	} catch (SAXException x) {
+	    System.err.println ("Error in handleText");
+	}
+    }
+
+    /**
+     * Error handling.
+     */
+    protected void handleError(int ln, String errorMsg) {
+	// System.err.println ("handleError ("+ln+": "+errorMsg+")");
+    }
+
+    /**
+     *  Handle comment.
+     */
+    protected void handleComment(char text[]) {
+    }
+
+    public HTMLDocument getDocument () {
+	try {
+	    builder.endDocument ();
+	} catch (SAXException x) {}
+	return builder.getHTMLDocument ();
+    }
+
+
+    class Attributes implements org.xml.sax.AttributeList {
+	HashMap map = new HashMap();
+	ArrayList names = new ArrayList();
+	ArrayList values = new ArrayList ();
+
+	public int getLength() {
+	    return names.size();
+	}
+
+	public String getName (int i) {
+	    return (String) names.get (i);
+	}
+
+	public String getType (int i) {
+	    return "CDATA";
+	}
+
+	public String getType (String name) {
+	    return "CDATA";
+	}
+
+	public String getValue (int i) {
+	    return (String) values.get (i);
+	}
+
+	public String getValue (String name) {
+	    return (String) map.get (name);
+	}
+
+	public void convert (SimpleAttributeSet attset) {
+	    map.clear ();
+	    names.clear ();
+	    values.clear ();
+	    for (Enumeration e = attset.getAttributeNames(); e.hasMoreElements(); ) {
+	        Object name = e.nextElement ();
+	        Object value = attset.getAttribute (name).toString();
+	        map.put (name.toString(), value);
+	        names.add (name.toString());
+	        values.add (value);
+	    }
+	}
+
+    }
+}
+
--- a/src/helma/util/XmlUtils.java
+++ b/src/helma/util/XmlUtils.java
@ -0,0 +1,78 @@
+package helma.util;
+
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.MalformedURLException;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.xml.sax.Parser;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+
+public class XmlUtils {
+
+    private static DocumentBuilderFactory domBuilderFactory = null;
+    private static SAXParserFactory saxParserFactory = null;
+
+    public static Document parseXml (Object obj)
+		throws SAXException, IOException, ParserConfigurationException {
+	if (domBuilderFactory == null)
+	    domBuilderFactory = javax.xml.parsers.DocumentBuilderFactory.newInstance();
+	DocumentBuilder parser = domBuilderFactory.newDocumentBuilder();
+	Document doc = null;
+	if (obj instanceof String) try {
+	    // first try to interpret string as URL
+	    URL url = new URL (obj.toString ());
+	    doc = parser.parse (obj.toString());
+	} catch (MalformedURLException nourl) {
+	    // if not a URL, maybe it is the XML itself
+	    doc = parser.parse (new InputSource (new StringReader (obj.toString())));
+	} else if (obj instanceof InputStream) {
+	    doc = parser.parse (new InputSource ((InputStream) obj));
+	} else if (obj instanceof Reader) {
+	    doc = parser.parse (new InputSource ((Reader) obj));
+	}
+	doc.normalize();
+	return doc;
+    }
+
+
+    public static Document parseHtml (Object obj)
+		throws SAXException, IOException, ParserConfigurationException {
+	try {
+	    Class.forName ("org.apache.html.dom.HTMLBuilder");
+	} catch (Throwable notfound) {
+	    throw new IOException ("Couldn't load nekohtml/Xerces HTML parser: "+notfound);
+	}
+	Document doc = null;
+	HtmlParser parser = new HtmlParser ();
+	if (obj instanceof String) try {
+	    // first try to interpret string as URL
+	    URL url = new URL (obj.toString ());
+	    parser.parse (new InputStreamReader (url.openStream()));
+	} catch (MalformedURLException nourl) {
+	    // if not a URL, maybe it is the HTML itself
+	    parser.parse (new StringReader (obj.toString()));
+	} else if (obj instanceof InputStream) {
+	    parser.parse (new InputStreamReader ((InputStream) obj));
+	} else if (obj instanceof Reader) {
+	    parser.parse ((Reader) obj);
+	}
+	doc = parser.getDocument ();
+	return doc;
+    }
+
+
+}