Switch to standard Java (JAXP) XML parser for getXmlDocument()

and provide our own HTML parser for getHtmlDocument(), which is a
thin bridge between javax.swing.text.html.parser.* and the Apache
HTML-DOM implementation.
This commit is contained in:
hns 2002-06-03 20:15:16 +00:00
parent a699facf6d
commit 15d7342ea4
3 changed files with 251 additions and 35 deletions

View file

@ -654,24 +654,9 @@ public class HopExtension {
} }
public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException { public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
try { try {
// Class.forName ("org.apache.xerces.parsers.DOMParser");
// org.apache.xerces.parsers.DOMParser parser = new org.apache.xerces.parsers.DOMParser();
Class.forName ("org.openxml.parser.XMLParser");
org.openxml.parser.XMLParser parser = new org.openxml.parser.XMLParser();
Object p = arguments[0].toJavaObject (); Object p = arguments[0].toJavaObject ();
if (p instanceof String) try { Object doc = helma.util.XmlUtils.parseXml (p);
// first try to interpret string as URL return ESLoader.normalizeObject (doc, evaluator);
java.net.URL u = new java.net.URL (p.toString ());
parser.parse (p.toString());
} catch (java.net.MalformedURLException nourl) {
// if not a URL, maybe it is the XML itself
parser.parse (new InputSource (new StringReader (p.toString())));
}
else if (p instanceof InputStream)
parser.parse (new InputSource ((InputStream) p));
else if (p instanceof Reader)
parser.parse (new InputSource ((Reader) p));
return ESLoader.normalizeObject (parser.getDocument(), evaluator);
} catch (Exception noluck) { } catch (Exception noluck) {
app.logEvent ("Error creating XML document: "+noluck); app.logEvent ("Error creating XML document: "+noluck);
} }
@ -685,22 +670,9 @@ public class HopExtension {
} }
public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException { public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
try { try {
Class.forName ("org.openxml.parser.HTMLParser");
org.openxml.parser.HTMLParser parser = new org.openxml.parser.HTMLParser();
Object p = arguments[0].toJavaObject (); Object p = arguments[0].toJavaObject ();
if (p instanceof String) try { Object doc = helma.util.XmlUtils.parseHtml (p);
// first try to interpret string as URL return ESLoader.normalizeObject (doc, evaluator);
java.net.URL u = new java.net.URL (p.toString ());
parser.parse (p.toString());
} catch (java.net.MalformedURLException nourl) {
// if not a URL, maybe it is the HTML itself
parser.parse (new InputSource (new StringReader (p.toString())));
}
else if (p instanceof InputStream)
parser.parse (new InputSource ((InputStream) p));
else if (p instanceof Reader)
parser.parse (new InputSource ((Reader) p));
return ESLoader.normalizeObject (parser.getDocument(), evaluator);
} catch (Exception noluck) { } catch (Exception noluck) {
app.logEvent ("Error creating HTML document: "+noluck); app.logEvent ("Error creating HTML document: "+noluck);
} }
@ -714,13 +686,13 @@ public class HopExtension {
} }
public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException { public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
try { try {
Class.forName ("org.w3c.dom.Document"); Class.forName ("org.w3c.dom.Document");
org.w3c.dom.Document doc = (org.w3c.dom.Document) arguments[0].toJavaObject (); org.w3c.dom.Document doc = (org.w3c.dom.Document) arguments[0].toJavaObject ();
Class.forName ("org.jdom.input.DOMBuilder"); Class.forName ("org.jdom.input.DOMBuilder");
org.jdom.input.DOMBuilder builder = new org.jdom.input.DOMBuilder (); org.jdom.input.DOMBuilder builder = new org.jdom.input.DOMBuilder ();
return ESLoader.normalizeObject (builder.build (doc), evaluator); return ESLoader.normalizeObject (builder.build (doc), evaluator);
} catch (Exception noluck) { } catch (Exception noluck) {
app.logEvent ("Error wrapping JDOM document: "+noluck); app.logEvent ("Error building JDOM document: "+noluck);
} }
return ESNull.theNull; return ESNull.theNull;
} }

View file

@ -0,0 +1,166 @@
// HtmlParser.java
// Copyright (c) Hannes Wallnöfer 2002
package helma.util;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Enumeration;
import java.io.IOException;
import javax.swing.text.html.parser.*;
import javax.swing.text.SimpleAttributeSet;
import org.xml.sax.SAXException;
import org.apache.html.dom.*;
import org.w3c.dom.html.HTMLDocument;
/**
 * HTML parser that forwards the callbacks of the Swing HTML parser
 * (javax.swing.text.html.parser.Parser) to an Apache HTML-DOM
 * HTMLBuilder, which assembles the resulting HTMLDocument.
 *
 * Usage: create an instance, feed it input through the inherited
 * parse(Reader) method, then fetch the result with getDocument().
 */
public class HtmlParser extends Parser {

    /** SAX document handler that builds the HTML DOM tree. */
    HTMLBuilder builder;

    /** Reusable adapter holding the attributes of the tag currently being handled. */
    Attributes attributes = new Attributes ();

    /**
     * Creates a parser on top of the "html32" DTD and starts a new document
     * on the underlying builder.
     *
     * @throws IOException declared by DTD.getDTD()
     */
    public HtmlParser () throws IOException {
        super (DTD.getDTD ("html32"));
        // Per the JDK documentation DTD.getElement() creates the element when the
        // DTD does not contain it yet, so these calls make sure the listed
        // elements are known to the parser.
        String[] required = {
            "table", "tr", "td", "span", "div", "font", "b",
            "i", "a", "blockquote", "em", "ul", "ol", "li"
        };
        for (int i = 0; i < required.length; i++) {
            dtd.getElement (required[i]);
        }
        // NOTE(review): presumably removed so the parser applies no DTD-specific
        // handling to these head elements -- confirm against the original intent.
        dtd.elementHash.remove ("meta");
        dtd.elementHash.remove ("link");
        dtd.elementHash.remove ("base");
        builder = new HTMLBuilder ();
        try {
            builder.startDocument ();
        } catch (SAXException x) {
            System.err.println ("Error in constructor: "+x);
        }
    }

    /**
     * Returns the upper-cased name of a tag as passed to the builder.
     * Upper-casing uses a fixed locale: with the default locale a name such as
     * "img" would become "\u0130MG" on a Turkish system.
     */
    private static String tagName (TagElement tag) {
        return tag.getHTMLTag ().toString ().toUpperCase (java.util.Locale.ENGLISH);
    }

    /**
     * Handle Start Tag: opens the element on the builder with the
     * attributes collected by the parser.
     */
    protected void handleStartTag(TagElement tag) {
        attributes.convert (getAttributes ());
        flushAttributes ();
        try {
            builder.startElement (tagName (tag), attributes);
        } catch (SAXException x) {
            System.err.println ("Error in handleStartTag: "+x);
        }
    }

    /**
     * Handle Empty Tag: opens and immediately closes the element on the builder.
     */
    protected void handleEmptyTag(TagElement tag) {
        attributes.convert (getAttributes ());
        flushAttributes ();
        String name = tagName (tag);
        try {
            builder.startElement (name, attributes);
            builder.endElement (name);
        } catch (SAXException x) {
            System.err.println ("Error in handleEmptyTag: "+x);
        }
    }

    /**
     * Handle End Tag: closes the element on the builder.
     */
    protected void handleEndTag(TagElement tag) {
        try {
            builder.endElement (tagName (tag));
        } catch (SAXException x) {
            System.err.println ("Error in handleEndTag: "+x);
        }
    }

    /**
     * Handle Text: passes the character data on to the builder.
     */
    protected void handleText(char data[]) {
        try {
            builder.characters (data, 0, data.length);
        } catch (SAXException x) {
            System.err.println ("Error in handleText: "+x);
        }
    }

    /**
     * Error handling: parse errors reported by the Swing parser are ignored.
     */
    protected void handleError(int ln, String errorMsg) {
        // intentionally empty
    }

    /**
     * Handle comment: comments are not passed on to the builder.
     */
    protected void handleComment(char text[]) {
        // intentionally empty
    }

    /**
     * Ends the document on the builder and returns the HTML document it built.
     * A SAXException raised while ending the document is reported on
     * System.err; the document is returned regardless.
     *
     * @return the document obtained from the builder
     */
    public HTMLDocument getDocument () {
        try {
            builder.endDocument ();
        } catch (SAXException x) {
            // best effort: still hand out whatever the builder has
            System.err.println ("Error in getDocument: "+x);
        }
        return builder.getHTMLDocument ();
    }

    /**
     * SAX AttributeList view of a Swing SimpleAttributeSet. Names and values
     * are stored as strings; every attribute is reported with type "CDATA".
     */
    class Attributes implements org.xml.sax.AttributeList {

        /** attribute name -> value, for lookup by name */
        HashMap map = new HashMap();
        /** attribute names in enumeration order, for lookup by index */
        ArrayList names = new ArrayList();
        /** attribute values, parallel to names */
        ArrayList values = new ArrayList ();

        public int getLength() {
            return names.size();
        }

        public String getName (int i) {
            return (String) names.get (i);
        }

        public String getType (int i) {
            return "CDATA";
        }

        public String getType (String name) {
            return "CDATA";
        }

        public String getValue (int i) {
            return (String) values.get (i);
        }

        public String getValue (String name) {
            return (String) map.get (name);
        }

        /**
         * Replaces the current contents with the string forms of the names
         * and values found in the given attribute set.
         */
        public void convert (SimpleAttributeSet attset) {
            map.clear ();
            names.clear ();
            values.clear ();
            for (Enumeration e = attset.getAttributeNames(); e.hasMoreElements(); ) {
                Object key = e.nextElement ();
                String name = key.toString ();
                String value = attset.getAttribute (key).toString ();
                map.put (name, value);
                names.add (name);
                values.add (value);
            }
        }
    }
}

View file

@ -0,0 +1,78 @@
package helma.util;
import java.io.InputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.MalformedURLException;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.xml.sax.Parser;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Static helpers for turning XML or HTML input into DOM documents.
 * Both parse methods accept the same kinds of input: a String (either a
 * URL or the markup itself), an InputStream or a Reader.
 */
public class XmlUtils {

    private static DocumentBuilderFactory domBuilderFactory = null;
    private static SAXParserFactory saxParserFactory = null;

    /**
     * Parses XML into a normalized DOM document using the JAXP parser.
     * NOTE(review): the factory is used with its default settings; if input
     * may come from untrusted sources, consider restricting DTDs and
     * external entities.
     *
     * @param obj a String holding either a URL or the XML text itself,
     *            an InputStream, or a Reader
     * @return the parsed and normalized document
     * @throws IllegalArgumentException if obj is null or of any other type
     */
    public static Document parseXml (Object obj)
            throws SAXException, IOException, ParserConfigurationException {
        if (domBuilderFactory == null)
            domBuilderFactory = DocumentBuilderFactory.newInstance ();
        DocumentBuilder parser = domBuilderFactory.newDocumentBuilder ();
        Document doc;
        if (obj instanceof String) {
            String text = (String) obj;
            if (toUrl (text) != null) {
                // the string is a URL: let the parser fetch it
                doc = parser.parse (text);
            } else {
                // not a URL, so treat the string as the XML itself
                doc = parser.parse (new InputSource (new StringReader (text)));
            }
        } else if (obj instanceof InputStream) {
            doc = parser.parse (new InputSource ((InputStream) obj));
        } else if (obj instanceof Reader) {
            doc = parser.parse (new InputSource ((Reader) obj));
        } else {
            // previously this fell through to doc.normalize() and died with a bare NPE
            throw new IllegalArgumentException ("Can't parse XML from "+
                (obj == null ? "null" : obj.getClass ().getName ()));
        }
        doc.normalize ();
        return doc;
    }

    /**
     * Parses HTML into a DOM document using HtmlParser.
     *
     * @param obj a String holding either a URL or the HTML text itself,
     *            an InputStream, or a Reader; for any other value nothing
     *            is parsed and the parser's document is returned as is
     * @return the document produced by the HTML parser
     * @throws IOException if the HTML DOM builder class can't be loaded
     *         or reading the input fails
     */
    public static Document parseHtml (Object obj)
            throws SAXException, IOException, ParserConfigurationException {
        try {
            Class.forName ("org.apache.html.dom.HTMLBuilder");
        } catch (Throwable notfound) {
            throw new IOException ("Couldn't load nekohtml/Xerces HTML parser: "+notfound);
        }
        HtmlParser parser = new HtmlParser ();
        if (obj instanceof String) {
            String text = (String) obj;
            URL url = toUrl (text);
            if (url != null) {
                // we opened this stream ourselves, so we also have to close it
                InputStream in = url.openStream ();
                try {
                    parser.parse (new InputStreamReader (in));
                } finally {
                    try {
                        in.close ();
                    } catch (IOException ignored) {
                        // nothing useful to do if closing a read-only stream fails
                    }
                }
            } else {
                // not a URL, so treat the string as the HTML itself
                parser.parse (new StringReader (text));
            }
        } else if (obj instanceof InputStream) {
            parser.parse (new InputStreamReader ((InputStream) obj));
        } else if (obj instanceof Reader) {
            parser.parse ((Reader) obj);
        }
        return parser.getDocument ();
    }

    /**
     * Returns the given string as a URL, or null if it is not a well-formed URL.
     */
    private static URL toUrl (String str) {
        try {
            return new URL (str);
        } catch (MalformedURLException nourl) {
            return null;
        }
    }
}