Switch to the standard Java (JAXP) XML parser for getXmlDocument()
and provide our own HTML parser for getHtmlDocument(), which is a thin bridge between javax.swing.text.html.parser.* and the Apache HTML-DOM implementation.
This commit is contained in:
parent
a699facf6d
commit
15d7342ea4
3 changed files with 251 additions and 35 deletions
|
@ -654,24 +654,9 @@ public class HopExtension {
|
|||
}
|
||||
public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
|
||||
try {
|
||||
// Class.forName ("org.apache.xerces.parsers.DOMParser");
|
||||
// org.apache.xerces.parsers.DOMParser parser = new org.apache.xerces.parsers.DOMParser();
|
||||
Class.forName ("org.openxml.parser.XMLParser");
|
||||
org.openxml.parser.XMLParser parser = new org.openxml.parser.XMLParser();
|
||||
Object p = arguments[0].toJavaObject ();
|
||||
if (p instanceof String) try {
|
||||
// first try to interpret string as URL
|
||||
java.net.URL u = new java.net.URL (p.toString ());
|
||||
parser.parse (p.toString());
|
||||
} catch (java.net.MalformedURLException nourl) {
|
||||
// if not a URL, maybe it is the XML itself
|
||||
parser.parse (new InputSource (new StringReader (p.toString())));
|
||||
}
|
||||
else if (p instanceof InputStream)
|
||||
parser.parse (new InputSource ((InputStream) p));
|
||||
else if (p instanceof Reader)
|
||||
parser.parse (new InputSource ((Reader) p));
|
||||
return ESLoader.normalizeObject (parser.getDocument(), evaluator);
|
||||
Object doc = helma.util.XmlUtils.parseXml (p);
|
||||
return ESLoader.normalizeObject (doc, evaluator);
|
||||
} catch (Exception noluck) {
|
||||
app.logEvent ("Error creating XML document: "+noluck);
|
||||
}
|
||||
|
@ -685,22 +670,9 @@ public class HopExtension {
|
|||
}
|
||||
public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
|
||||
try {
|
||||
Class.forName ("org.openxml.parser.HTMLParser");
|
||||
org.openxml.parser.HTMLParser parser = new org.openxml.parser.HTMLParser();
|
||||
Object p = arguments[0].toJavaObject ();
|
||||
if (p instanceof String) try {
|
||||
// first try to interpret string as URL
|
||||
java.net.URL u = new java.net.URL (p.toString ());
|
||||
parser.parse (p.toString());
|
||||
} catch (java.net.MalformedURLException nourl) {
|
||||
// if not a URL, maybe it is the HTML itself
|
||||
parser.parse (new InputSource (new StringReader (p.toString())));
|
||||
}
|
||||
else if (p instanceof InputStream)
|
||||
parser.parse (new InputSource ((InputStream) p));
|
||||
else if (p instanceof Reader)
|
||||
parser.parse (new InputSource ((Reader) p));
|
||||
return ESLoader.normalizeObject (parser.getDocument(), evaluator);
|
||||
Object doc = helma.util.XmlUtils.parseHtml (p);
|
||||
return ESLoader.normalizeObject (doc, evaluator);
|
||||
} catch (Exception noluck) {
|
||||
app.logEvent ("Error creating HTML document: "+noluck);
|
||||
}
|
||||
|
@ -714,13 +686,13 @@ public class HopExtension {
|
|||
}
|
||||
public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
|
||||
try {
|
||||
Class.forName ("org.w3c.dom.Document");
|
||||
Class.forName ("org.w3c.dom.Document");
|
||||
org.w3c.dom.Document doc = (org.w3c.dom.Document) arguments[0].toJavaObject ();
|
||||
Class.forName ("org.jdom.input.DOMBuilder");
|
||||
org.jdom.input.DOMBuilder builder = new org.jdom.input.DOMBuilder ();
|
||||
org.jdom.input.DOMBuilder builder = new org.jdom.input.DOMBuilder ();
|
||||
return ESLoader.normalizeObject (builder.build (doc), evaluator);
|
||||
} catch (Exception noluck) {
|
||||
app.logEvent ("Error wrapping JDOM document: "+noluck);
|
||||
app.logEvent ("Error building JDOM document: "+noluck);
|
||||
}
|
||||
return ESNull.theNull;
|
||||
}
|
||||
|
|
166
src/helma/util/HtmlParser.java
Normal file
166
src/helma/util/HtmlParser.java
Normal file
|
@ -0,0 +1,166 @@
|
|||
// HtmlParser.java
|
||||
// Copyright (c) Hannes Wallnöfer 2002
|
||||
|
||||
package helma.util;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Enumeration;
|
||||
import java.io.IOException;
|
||||
import javax.swing.text.html.parser.*;
|
||||
import javax.swing.text.SimpleAttributeSet;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.apache.html.dom.*;
|
||||
import org.w3c.dom.html.HTMLDocument;
|
||||
|
||||
public class HtmlParser extends Parser {
|
||||
|
||||
HTMLBuilder builder;
|
||||
Attributes attributes = new Attributes ();
|
||||
|
||||
public HtmlParser () throws IOException {
|
||||
super (DTD.getDTD ("html32"));
|
||||
dtd.getElement ("table");
|
||||
dtd.getElement ("tr");
|
||||
dtd.getElement ("td");
|
||||
dtd.getElement ("span");
|
||||
dtd.getElement ("div");
|
||||
dtd.getElement ("font");
|
||||
dtd.getElement ("b");
|
||||
dtd.getElement ("i");
|
||||
dtd.getElement ("a");
|
||||
dtd.getElement ("blockquote");
|
||||
dtd.getElement ("em");
|
||||
dtd.getElement ("ul");
|
||||
dtd.getElement ("ol");
|
||||
dtd.getElement ("li");
|
||||
dtd.elementHash.remove ("meta");
|
||||
dtd.elementHash.remove ("link");
|
||||
dtd.elementHash.remove ("base");
|
||||
builder = new HTMLBuilder ();
|
||||
try {
|
||||
builder.startDocument ();
|
||||
} catch (SAXException x) {
|
||||
System.err.println ("Error in constructor");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle Start Tag.
|
||||
*/
|
||||
protected void handleStartTag(TagElement tag) {
|
||||
// System.err.println ("handleStartTag ("+tag.getHTMLTag()+")");
|
||||
attributes.convert (getAttributes());
|
||||
flushAttributes();
|
||||
try {
|
||||
builder.startElement (tag.getHTMLTag().toString().toUpperCase(), attributes);
|
||||
} catch (SAXException x) {
|
||||
System.err.println ("Error in handleStartTag");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle Empty Tag.
|
||||
*/
|
||||
protected void handleEmptyTag(TagElement tag) {
|
||||
// System.err.println ("handleEmptyTag ("+tag.getHTMLTag()+")");
|
||||
attributes.convert (getAttributes());
|
||||
flushAttributes();
|
||||
try {
|
||||
builder.startElement (tag.getHTMLTag().toString().toUpperCase(), attributes);
|
||||
builder.endElement (tag.getHTMLTag().toString().toUpperCase());
|
||||
} catch (SAXException x) {
|
||||
System.err.println ("Error in handleEmptyTag: "+x);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle End Tag.
|
||||
*/
|
||||
protected void handleEndTag(TagElement tag) {
|
||||
// System.err.println ("handleEndTag ("+tag.getHTMLTag()+")");
|
||||
try {
|
||||
builder.endElement (tag.getHTMLTag().toString().toUpperCase());
|
||||
} catch (SAXException x) {
|
||||
System.err.println ("Error in handleEndTag: "+x);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle Text.
|
||||
*/
|
||||
protected void handleText(char data[]) {
|
||||
// System.err.println ("handleText ("+new String (data)+")");
|
||||
try {
|
||||
builder.characters (data, 0, data.length);
|
||||
} catch (SAXException x) {
|
||||
System.err.println ("Error in handleText");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Error handling.
|
||||
*/
|
||||
protected void handleError(int ln, String errorMsg) {
|
||||
// System.err.println ("handleError ("+ln+": "+errorMsg+")");
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle comment.
|
||||
*/
|
||||
protected void handleComment(char text[]) {
|
||||
}
|
||||
|
||||
public HTMLDocument getDocument () {
|
||||
try {
|
||||
builder.endDocument ();
|
||||
} catch (SAXException x) {}
|
||||
return builder.getHTMLDocument ();
|
||||
}
|
||||
|
||||
|
||||
class Attributes implements org.xml.sax.AttributeList {
|
||||
HashMap map = new HashMap();
|
||||
ArrayList names = new ArrayList();
|
||||
ArrayList values = new ArrayList ();
|
||||
|
||||
public int getLength() {
|
||||
return names.size();
|
||||
}
|
||||
|
||||
public String getName (int i) {
|
||||
return (String) names.get (i);
|
||||
}
|
||||
|
||||
public String getType (int i) {
|
||||
return "CDATA";
|
||||
}
|
||||
|
||||
public String getType (String name) {
|
||||
return "CDATA";
|
||||
}
|
||||
|
||||
public String getValue (int i) {
|
||||
return (String) values.get (i);
|
||||
}
|
||||
|
||||
public String getValue (String name) {
|
||||
return (String) map.get (name);
|
||||
}
|
||||
|
||||
public void convert (SimpleAttributeSet attset) {
|
||||
map.clear ();
|
||||
names.clear ();
|
||||
values.clear ();
|
||||
for (Enumeration e = attset.getAttributeNames(); e.hasMoreElements(); ) {
|
||||
Object name = e.nextElement ();
|
||||
Object value = attset.getAttribute (name).toString();
|
||||
map.put (name.toString(), value);
|
||||
names.add (name.toString());
|
||||
values.add (value);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
78
src/helma/util/XmlUtils.java
Normal file
78
src/helma/util/XmlUtils.java
Normal file
|
@ -0,0 +1,78 @@
|
|||
package helma.util;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URL;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import org.w3c.dom.Document;
|
||||
import org.xml.sax.Parser;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
|
||||
public class XmlUtils {
|
||||
|
||||
private static DocumentBuilderFactory domBuilderFactory = null;
|
||||
private static SAXParserFactory saxParserFactory = null;
|
||||
|
||||
public static Document parseXml (Object obj)
|
||||
throws SAXException, IOException, ParserConfigurationException {
|
||||
if (domBuilderFactory == null)
|
||||
domBuilderFactory = javax.xml.parsers.DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder parser = domBuilderFactory.newDocumentBuilder();
|
||||
Document doc = null;
|
||||
if (obj instanceof String) try {
|
||||
// first try to interpret string as URL
|
||||
URL url = new URL (obj.toString ());
|
||||
doc = parser.parse (obj.toString());
|
||||
} catch (MalformedURLException nourl) {
|
||||
// if not a URL, maybe it is the XML itself
|
||||
doc = parser.parse (new InputSource (new StringReader (obj.toString())));
|
||||
} else if (obj instanceof InputStream) {
|
||||
doc = parser.parse (new InputSource ((InputStream) obj));
|
||||
} else if (obj instanceof Reader) {
|
||||
doc = parser.parse (new InputSource ((Reader) obj));
|
||||
}
|
||||
doc.normalize();
|
||||
return doc;
|
||||
}
|
||||
|
||||
|
||||
public static Document parseHtml (Object obj)
|
||||
throws SAXException, IOException, ParserConfigurationException {
|
||||
try {
|
||||
Class.forName ("org.apache.html.dom.HTMLBuilder");
|
||||
} catch (Throwable notfound) {
|
||||
throw new IOException ("Couldn't load nekohtml/Xerces HTML parser: "+notfound);
|
||||
}
|
||||
Document doc = null;
|
||||
HtmlParser parser = new HtmlParser ();
|
||||
if (obj instanceof String) try {
|
||||
// first try to interpret string as URL
|
||||
URL url = new URL (obj.toString ());
|
||||
parser.parse (new InputStreamReader (url.openStream()));
|
||||
} catch (MalformedURLException nourl) {
|
||||
// if not a URL, maybe it is the XML itself
|
||||
parser.parse (new StringReader (obj.toString()));
|
||||
} else if (obj instanceof InputStream) {
|
||||
parser.parse (new InputStreamReader ((InputStream) obj));
|
||||
} else if (obj instanceof Reader) {
|
||||
parser.parse ((Reader) obj);
|
||||
}
|
||||
doc = parser.getDocument ();
|
||||
return doc;
|
||||
}
|
||||
|
||||
|
||||
}
|
Loading…
Add table
Reference in a new issue