Switch to standard Java (JAXP) XML parser for getXmlDocument()

and provide our own HTML parser for getHtmlDocument(), which is a thin bridge between javax.swing.text.html.parser.* and the Apache HTML-DOM implementation.
2002-06-03 20:15:16 +00:00 · 2002-06-03 20:15:16 +00:00 · 15d7342ea4
commit 15d7342ea4
parent a699facf6d
3 changed files with 251 additions and 35 deletions
--- a/src/helma/scripting/fesi/HopExtension.java
+++ b/src/helma/scripting/fesi/HopExtension.java
@ -654,24 +654,9 @@ public class HopExtension {
        }
        public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
            try {
                // Class.forName ("org.apache.xerces.parsers.DOMParser");
                // org.apache.xerces.parsers.DOMParser parser = new org.apache.xerces.parsers.DOMParser();
                Class.forName ("org.openxml.parser.XMLParser");
                org.openxml.parser.XMLParser parser = new org.openxml.parser.XMLParser();
                Object p = arguments[0].toJavaObject ();
-                if (p instanceof String) try {
+                Object doc = helma.util.XmlUtils.parseXml (p);
-                   // first try to interpret string as URL
+                return ESLoader.normalizeObject (doc, evaluator);
                   java.net.URL u = new java.net.URL (p.toString ());
                   parser.parse (p.toString());
                } catch (java.net.MalformedURLException nourl) {
                   // if not a URL, maybe it is the XML itself
                   parser.parse (new InputSource (new StringReader (p.toString())));
                }
                else if (p instanceof InputStream)
                   parser.parse (new InputSource ((InputStream) p));
                else if (p instanceof Reader)
                   parser.parse (new InputSource ((Reader) p));
                return ESLoader.normalizeObject (parser.getDocument(), evaluator);
            } catch (Exception noluck) {
                app.logEvent ("Error creating XML document: "+noluck);
            }
@ -685,22 +670,9 @@ public class HopExtension {
        }
        public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
            try {
                Class.forName ("org.openxml.parser.HTMLParser");
                org.openxml.parser.HTMLParser parser = new org.openxml.parser.HTMLParser();
                Object p = arguments[0].toJavaObject ();
-                if (p instanceof String) try {
+                Object doc = helma.util.XmlUtils.parseHtml (p);
-                   // first try to interpret string as URL
+                return ESLoader.normalizeObject (doc, evaluator);
                   java.net.URL u = new java.net.URL (p.toString ());
                   parser.parse (p.toString());
                } catch (java.net.MalformedURLException nourl) {
                   // if not a URL, maybe it is the HTML itself
                   parser.parse (new InputSource (new StringReader (p.toString())));
                }
                else if (p instanceof InputStream)
                   parser.parse (new InputSource ((InputStream) p));
                else if (p instanceof Reader)
                   parser.parse (new InputSource ((Reader) p));
                return ESLoader.normalizeObject (parser.getDocument(), evaluator);
            } catch (Exception noluck) {
                app.logEvent ("Error creating HTML document: "+noluck);
            }
@ -714,13 +686,13 @@ public class HopExtension {
        }
        public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
            try {
-	   Class.forName ("org.w3c.dom.Document");
+                Class.forName ("org.w3c.dom.Document");
                org.w3c.dom.Document doc = (org.w3c.dom.Document) arguments[0].toJavaObject ();
                Class.forName ("org.jdom.input.DOMBuilder");
-	   org.jdom.input.DOMBuilder builder = new org.jdom.input.DOMBuilder ();
+                org.jdom.input.DOMBuilder builder = new org.jdom.input.DOMBuilder ();
                return ESLoader.normalizeObject (builder.build (doc), evaluator);
            } catch (Exception noluck) {
-                app.logEvent ("Error wrapping JDOM document: "+noluck);
+                app.logEvent ("Error building JDOM document: "+noluck);
            }
            return ESNull.theNull;
        }
--- a/src/helma/util/HtmlParser.java
+++ b/src/helma/util/HtmlParser.java
@ -0,0 +1,166 @@
 // HtmlParser.java
 // Copyright (c) Hannes Wallnöfer 2002
 package helma.util;
 import java.util.HashMap;
 import java.util.ArrayList;
 import java.util.Enumeration;
 import java.io.IOException;
 import javax.swing.text.html.parser.*;
 import javax.swing.text.SimpleAttributeSet;
 import org.xml.sax.SAXException;
 import org.apache.html.dom.*;
 import org.w3c.dom.html.HTMLDocument;
 /**
  * Thin bridge between the Swing HTML parser (javax.swing.text.html.parser.Parser)
  * and the org.apache.html.dom.HTMLBuilder: every parser callback is forwarded
  * to the builder as the corresponding SAX-style event, and the resulting
  * document can be fetched with {@link #getDocument()}.
  *
  * SAX errors raised by the builder are reported on System.err and otherwise
  * ignored, so parsing continues on a best-effort basis.
  */
 public class HtmlParser extends Parser {
     /** Receives the element/text events and builds the HTML DOM. */
     HTMLBuilder builder;
     /** Reusable attribute list, refilled for every start/empty tag. */
     Attributes attributes = new Attributes ();
     /**
      * Creates a parser on top of the DTD registered under the name "html32"
      * and starts a new document on the builder.
      *
      * @throws IOException declared by DTD.getDTD()
      */
     public HtmlParser () throws IOException {
         super (DTD.getDTD ("html32"));
         // DTD.getElement() is documented to create an element definition when
         // none exists yet; presumably these calls make sure the DTD knows the
         // listed tags -- TODO confirm.
         String[] ensured = {
             "table", "tr", "td", "span", "div", "font", "b",
             "i", "a", "blockquote", "em", "ul", "ol", "li"
         };
         for (int i = 0; i < ensured.length; i++) {
             dtd.getElement (ensured[i]);
         }
         // Drop any existing definitions for these three elements from the DTD.
         dtd.elementHash.remove ("meta");
         dtd.elementHash.remove ("link");
         dtd.elementHash.remove ("base");
         builder = new HTMLBuilder ();
         try {
             builder.startDocument ();
         } catch (SAXException x) {
             reportError ("constructor", x);
         }
     }
     /**
      * Handle Start Tag: forwards the tag and its attributes to the builder.
      */
     protected void handleStartTag(TagElement tag) {
         // copy the parser's current attributes, then reset them for the next tag
         attributes.convert (getAttributes());
         flushAttributes();
         try {
             builder.startElement (tagName (tag), attributes);
         } catch (SAXException x) {
             reportError ("handleStartTag", x);
         }
     }
     /**
      * Handle Empty Tag: forwarded to the builder as a start element
      * immediately followed by the matching end element.
      */
     protected void handleEmptyTag(TagElement tag) {
         attributes.convert (getAttributes());
         flushAttributes();
         String name = tagName (tag);
         try {
             builder.startElement (name, attributes);
             builder.endElement (name);
         } catch (SAXException x) {
             reportError ("handleEmptyTag", x);
         }
     }
     /**
      * Handle End Tag: forwards the end of the element to the builder.
      */
     protected void handleEndTag(TagElement tag) {
         try {
             builder.endElement (tagName (tag));
         } catch (SAXException x) {
             reportError ("handleEndTag", x);
         }
     }
     /**
      * Handle Text: forwards the complete character array to the builder.
      */
     protected void handleText(char data[]) {
         try {
             builder.characters (data, 0, data.length);
         } catch (SAXException x) {
             reportError ("handleText", x);
         }
     }
     /*
      * Error handling. Parse errors reported by the Swing parser are
      * deliberately ignored.
      */
     protected void handleError(int ln, String errorMsg) {
         // System.err.println ("handleError ("+ln+": "+errorMsg+")");
     }
     /**
      *  Handle comment. Comments are not passed on to the builder.
      */
     protected void handleComment(char text[]) {
     }
     /**
      * Ends the document on the builder and returns what it has built.
      * A SAXException from endDocument() is reported on System.err; the
      * builder's document is returned regardless.
      *
      * @return the document held by the builder
      */
     public HTMLDocument getDocument () {
         try {
             builder.endDocument ();
         } catch (SAXException x) {
             reportError ("getDocument", x);
         }
         return builder.getHTMLDocument ();
     }
     /**
      * Returns the tag's name in upper case. The conversion uses a fixed
      * locale so the result does not depend on the JVM's default locale
      * (e.g. the Turkish dotted/dotless i rules).
      */
     private static String tagName (TagElement tag) {
         return tag.getHTMLTag().toString().toUpperCase (java.util.Locale.ENGLISH);
     }
     /**
      * Reports a builder error on System.err in one uniform format,
      * always including the exception itself.
      */
     private static void reportError (String where, SAXException x) {
         System.err.println ("Error in "+where+": "+x);
     }
     /**
      * Adapter exposing the attributes of a SimpleAttributeSet through the
      * SAX AttributeList interface. Every attribute is reported with the
      * type "CDATA".
      */
     class Attributes implements org.xml.sax.AttributeList {
         // name -> value lookup for getValue(String)
         HashMap map = new HashMap();
         // names and values in enumeration order; the two lists share indices
         ArrayList names = new ArrayList();
         ArrayList values = new ArrayList ();
         public int getLength() {
             return names.size();
         }
         public String getName (int i) {
             return (String) names.get (i);
         }
         public String getType (int i) {
             return "CDATA";
         }
         public String getType (String name) {
             return "CDATA";
         }
         public String getValue (int i) {
             return (String) values.get (i);
         }
         public String getValue (String name) {
             return (String) map.get (name);
         }
         /**
          * Replaces the current content with the attributes of the given
          * set; names and values are stored as their toString() form.
          */
         public void convert (SimpleAttributeSet attset) {
             map.clear ();
             names.clear ();
             values.clear ();
             for (Enumeration e = attset.getAttributeNames(); e.hasMoreElements(); ) {
                 Object key = e.nextElement ();
                 String name = key.toString();
                 String value = attset.getAttribute (key).toString();
                 map.put (name, value);
                 names.add (name);
                 values.add (value);
             }
         }
     }
 }
--- a/src/helma/util/XmlUtils.java
+++ b/src/helma/util/XmlUtils.java
@ -0,0 +1,78 @@
 package helma.util;
 import java.io.InputStream;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.io.InputStreamReader;
 import java.net.URL;
 import java.net.MalformedURLException;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.SAXParserFactory;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.ParserConfigurationException;
 import org.w3c.dom.Document;
 import org.xml.sax.Parser;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 /**
  * Static helpers that turn a String, InputStream or Reader into a DOM
  * document, either with the JAXP XML parser or with the HtmlParser bridge.
  */
 public class XmlUtils {
     // created on first use by parseXml()
     private static DocumentBuilderFactory domBuilderFactory = null;
     private static SAXParserFactory saxParserFactory = null;
     /**
      * Parses XML into a normalized DOM document using the JAXP
      * DocumentBuilder.
      *
      * NOTE(review): the builder is used with the factory's default settings;
      * if callers may pass untrusted XML, consider restricting DTD/external
      * entity processing -- not changed here to keep behavior compatible.
      *
      * @param obj a String (tried as a URL first, otherwise taken as the XML
      *            text itself), an InputStream or a Reader
      * @return the parsed and normalized document
      * @throws SAXException if the parser reports a parse error
      * @throws IOException if reading the input fails
      * @throws ParserConfigurationException if no DocumentBuilder can be created
      * @throws IllegalArgumentException if obj is null or of any other type
      */
     public static Document parseXml (Object obj)
             throws SAXException, IOException, ParserConfigurationException {
         if (domBuilderFactory == null)
             domBuilderFactory = javax.xml.parsers.DocumentBuilderFactory.newInstance();
         DocumentBuilder parser = domBuilderFactory.newDocumentBuilder();
         Document doc;
         if (obj instanceof String) {
             String text = obj.toString ();
             try {
                 // first try to interpret string as URL; the constructor is only
                 // called for its MalformedURLException check
                 new URL (text);
                 doc = parser.parse (text);
             } catch (MalformedURLException nourl) {
                 // if not a URL, maybe it is the XML itself
                 doc = parser.parse (new InputSource (new StringReader (text)));
             }
         } else if (obj instanceof InputStream) {
             doc = parser.parse (new InputSource ((InputStream) obj));
         } else if (obj instanceof Reader) {
             doc = parser.parse (new InputSource ((Reader) obj));
         } else {
             // previously this fell through to doc.normalize() on a null document
             throw new IllegalArgumentException ("Can't parse XML from "+
                     (obj == null ? "null" : obj.getClass().getName()));
         }
         doc.normalize();
         return doc;
     }
     /**
      * Parses HTML into a DOM document using HtmlParser.
      *
      * Streams and readers passed in by the caller are not closed here; a
      * stream opened from a URL by this method is closed after parsing.
      * For any other argument type nothing is parsed and whatever
      * HtmlParser.getDocument() returns for an unused parser is returned.
      *
      * @param obj a String (tried as a URL first, otherwise taken as the HTML
      *            text itself), an InputStream or a Reader
      * @return the document produced by the HtmlParser
      * @throws IOException if org.apache.html.dom.HTMLBuilder cannot be loaded
      *                     or reading the input fails
      */
     public static Document parseHtml (Object obj)
             throws SAXException, IOException, ParserConfigurationException {
         try {
             Class.forName ("org.apache.html.dom.HTMLBuilder");
         } catch (Throwable notfound) {
             throw new IOException ("Couldn't load nekohtml/Xerces HTML parser: "+notfound);
         }
         HtmlParser parser = new HtmlParser ();
         if (obj instanceof String) {
             String text = obj.toString ();
             try {
                 // first try to interpret string as URL
                 URL url = new URL (text);
                 InputStream in = url.openStream();
                 try {
                     // NOTE(review): decodes with the platform default charset
                     parser.parse (new InputStreamReader (in));
                 } finally {
                     // we opened this stream, so we are responsible for closing it
                     in.close ();
                 }
             } catch (MalformedURLException nourl) {
                 // if not a URL, maybe it is the HTML itself
                 parser.parse (new StringReader (text));
             }
         } else if (obj instanceof InputStream) {
             parser.parse (new InputStreamReader ((InputStream) obj));
         } else if (obj instanceof Reader) {
             parser.parse ((Reader) obj);
         }
         return parser.getDocument ();
     }
 }