Move XML/HTML parsing out of HopExtension into helma.util.XmlUtils and add
helma.util.HtmlParser, which builds an HTML DOM from the Swing HTML parser.

Note: this patch was re-flowed from a copy whose line breaks had been lost.
Indentation is reconstructed, so whitespace-only changes in the HopExtension
hunks show up as identical -/+ pairs, and the blob ids on the index lines of
the two new files predate the current content of those files.

diff --git a/src/helma/scripting/fesi/HopExtension.java b/src/helma/scripting/fesi/HopExtension.java
index 30134a2d..45460134 100644
--- a/src/helma/scripting/fesi/HopExtension.java
+++ b/src/helma/scripting/fesi/HopExtension.java
@@ -654,24 +654,9 @@ public class HopExtension {
         }
         public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
             try {
-                // Class.forName ("org.apache.xerces.parsers.DOMParser");
-                // org.apache.xerces.parsers.DOMParser parser = new org.apache.xerces.parsers.DOMParser();
-                Class.forName ("org.openxml.parser.XMLParser");
-                org.openxml.parser.XMLParser parser = new org.openxml.parser.XMLParser();
                 Object p = arguments[0].toJavaObject ();
-                if (p instanceof String) try {
-                    // first try to interpret string as URL
-                    java.net.URL u = new java.net.URL (p.toString ());
-                    parser.parse (p.toString());
-                } catch (java.net.MalformedURLException nourl) {
-                    // if not a URL, maybe it is the XML itself
-                    parser.parse (new InputSource (new StringReader (p.toString())));
-                }
-                else if (p instanceof InputStream)
-                    parser.parse (new InputSource ((InputStream) p));
-                else if (p instanceof Reader)
-                    parser.parse (new InputSource ((Reader) p));
-                return ESLoader.normalizeObject (parser.getDocument(), evaluator);
+                Object doc = helma.util.XmlUtils.parseXml (p);
+                return ESLoader.normalizeObject (doc, evaluator);
             } catch (Exception noluck) {
                 app.logEvent ("Error creating XML document: "+noluck);
             }
@@ -685,22 +670,9 @@ public class HopExtension {
         }
         public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
             try {
-                Class.forName ("org.openxml.parser.HTMLParser");
-                org.openxml.parser.HTMLParser parser = new org.openxml.parser.HTMLParser();
                 Object p = arguments[0].toJavaObject ();
-                if (p instanceof String) try {
-                    // first try to interpret string as URL
-                    java.net.URL u = new java.net.URL (p.toString ());
-                    parser.parse (p.toString());
-                } catch (java.net.MalformedURLException nourl) {
-                    // if not a URL, maybe it is the HTML itself
-                    parser.parse (new InputSource (new StringReader (p.toString())));
-                }
-                else if (p instanceof InputStream)
-                    parser.parse (new InputSource ((InputStream) p));
-                else if (p instanceof Reader)
-                    parser.parse (new InputSource ((Reader) p));
-                return ESLoader.normalizeObject (parser.getDocument(), evaluator);
+                Object doc = helma.util.XmlUtils.parseHtml (p);
+                return ESLoader.normalizeObject (doc, evaluator);
             } catch (Exception noluck) {
                 app.logEvent ("Error creating HTML document: "+noluck);
             }
@@ -714,13 +686,13 @@ public class HopExtension {
         }
         public ESValue callFunction (ESObject thisObject, ESValue[] arguments) throws EcmaScriptException {
             try {
-                Class.forName ("org.w3c.dom.Document");
+                Class.forName ("org.w3c.dom.Document");
                 org.w3c.dom.Document doc = (org.w3c.dom.Document) arguments[0].toJavaObject ();
                 Class.forName ("org.jdom.input.DOMBuilder");
-                org.jdom.input.DOMBuilder builder = new org.jdom.input.DOMBuilder ();
+                org.jdom.input.DOMBuilder builder = new org.jdom.input.DOMBuilder ();
                 return ESLoader.normalizeObject (builder.build (doc), evaluator);
             } catch (Exception noluck) {
-                app.logEvent ("Error wrapping JDOM document: "+noluck);
+                app.logEvent ("Error building JDOM document: "+noluck);
             }
             return ESNull.theNull;
         }
diff --git a/src/helma/util/HtmlParser.java b/src/helma/util/HtmlParser.java
new file mode 100644
index 00000000..2341aa5f
--- /dev/null
+++ b/src/helma/util/HtmlParser.java
@@ -0,0 +1,201 @@
+// HtmlParser.java
+// Copyright (c) Hannes Wallnöfer 2002
+
+package helma.util;
+
+import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.Enumeration;
+import java.util.Locale;
+import java.io.IOException;
+import javax.swing.text.html.parser.*;
+import javax.swing.text.SimpleAttributeSet;
+import org.xml.sax.SAXException;
+import org.apache.html.dom.*;
+import org.w3c.dom.html.HTMLDocument;
+
+/**
+ * Parser that forwards the tag and text callbacks of the Swing HTML parser
+ * to an HTMLBuilder; the resulting document is available from getDocument().
+ */
+public class HtmlParser extends Parser {
+
+    HTMLBuilder builder;
+    Attributes attributes = new Attributes ();
+    // true once endDocument() has been sent to the builder
+    private boolean documentEnded = false;
+
+    /**
+     * Sets up the "html32" DTD and starts a new document on the builder.
+     */
+    public HtmlParser () throws IOException {
+        super (DTD.getDTD ("html32"));
+        // DTD.getElement() defines the element if the DTD doesn't know it yet
+        dtd.getElement ("table");
+        dtd.getElement ("tr");
+        dtd.getElement ("td");
+        dtd.getElement ("span");
+        dtd.getElement ("div");
+        dtd.getElement ("font");
+        dtd.getElement ("b");
+        dtd.getElement ("i");
+        dtd.getElement ("a");
+        dtd.getElement ("blockquote");
+        dtd.getElement ("em");
+        dtd.getElement ("ul");
+        dtd.getElement ("ol");
+        dtd.getElement ("li");
+        dtd.elementHash.remove ("meta");
+        dtd.elementHash.remove ("link");
+        dtd.elementHash.remove ("base");
+        builder = new HTMLBuilder ();
+        try {
+            builder.startDocument ();
+        } catch (SAXException x) {
+            System.err.println ("Error in constructor: "+x);
+        }
+    }
+
+    /**
+     * Returns the upper case tag name that is passed to the builder.
+     * Locale.ENGLISH keeps the result independent of the default locale
+     * (a Turkish default locale would map "i" to a dotted capital I).
+     */
+    private static String tagName (TagElement tag) {
+        return tag.getHTMLTag().toString().toUpperCase (Locale.ENGLISH);
+    }
+
+    /**
+     * Handle Start Tag.
+     */
+    protected void handleStartTag(TagElement tag) {
+        // System.err.println ("handleStartTag ("+tag.getHTMLTag()+")");
+        attributes.convert (getAttributes());
+        flushAttributes();
+        try {
+            builder.startElement (tagName (tag), attributes);
+        } catch (SAXException x) {
+            System.err.println ("Error in handleStartTag: "+x);
+        }
+    }
+
+    /**
+     * Handle Empty Tag.
+     */
+    protected void handleEmptyTag(TagElement tag) {
+        // System.err.println ("handleEmptyTag ("+tag.getHTMLTag()+")");
+        attributes.convert (getAttributes());
+        flushAttributes();
+        String name = tagName (tag);
+        try {
+            builder.startElement (name, attributes);
+            builder.endElement (name);
+        } catch (SAXException x) {
+            System.err.println ("Error in handleEmptyTag: "+x);
+        }
+    }
+
+    /**
+     * Handle End Tag.
+     */
+    protected void handleEndTag(TagElement tag) {
+        // System.err.println ("handleEndTag ("+tag.getHTMLTag()+")");
+        try {
+            builder.endElement (tagName (tag));
+        } catch (SAXException x) {
+            System.err.println ("Error in handleEndTag: "+x);
+        }
+    }
+
+    /**
+     * Handle Text.
+     */
+    protected void handleText(char data[]) {
+        // System.err.println ("handleText ("+new String (data)+")");
+        try {
+            builder.characters (data, 0, data.length);
+        } catch (SAXException x) {
+            System.err.println ("Error in handleText: "+x);
+        }
+    }
+
+    /*
+     * Error handling.
+     */
+    protected void handleError(int ln, String errorMsg) {
+        // System.err.println ("handleError ("+ln+": "+errorMsg+")");
+    }
+
+    /**
+     * Handle comment.
+     */
+    protected void handleComment(char text[]) {
+    }
+
+    /**
+     * Ends the builder's document on the first call and returns the
+     * document held by the builder.
+     */
+    public HTMLDocument getDocument () {
+        if (!documentEnded) {
+            documentEnded = true;
+            try {
+                builder.endDocument ();
+            } catch (SAXException x) {
+                System.err.println ("Error in getDocument: "+x);
+            }
+        }
+        return builder.getHTMLDocument ();
+    }
+
+
+    /**
+     * SAX AttributeList view of the attributes of the current tag.
+     * Every attribute is reported with type "CDATA".
+     */
+    class Attributes implements org.xml.sax.AttributeList {
+        HashMap map = new HashMap();
+        ArrayList names = new ArrayList();
+        ArrayList values = new ArrayList ();
+
+        public int getLength() {
+            return names.size();
+        }
+
+        public String getName (int i) {
+            return (String) names.get (i);
+        }
+
+        public String getType (int i) {
+            return "CDATA";
+        }
+
+        public String getType (String name) {
+            return "CDATA";
+        }
+
+        public String getValue (int i) {
+            return (String) values.get (i);
+        }
+
+        public String getValue (String name) {
+            return (String) map.get (name);
+        }
+
+        /** Replaces the stored attributes with those of the given set. */
+        public void convert (SimpleAttributeSet attset) {
+            map.clear ();
+            names.clear ();
+            values.clear ();
+            for (Enumeration e = attset.getAttributeNames(); e.hasMoreElements(); ) {
+                Object name = e.nextElement ();
+                Object value = attset.getAttribute (name).toString();
+                map.put (name.toString(), value);
+                names.add (name.toString());
+                values.add (value);
+            }
+        }
+
+    }
+}
+
diff --git a/src/helma/util/XmlUtils.java b/src/helma/util/XmlUtils.java
new file mode 100644
index 00000000..918ebcb1
--- /dev/null
+++ b/src/helma/util/XmlUtils.java
@@ -0,0 +1,120 @@
+package helma.util;
+
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.MalformedURLException;
+
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.SAXParserFactory;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.xml.sax.Parser;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Static helpers that turn a URL, markup string, InputStream or Reader
+ * into a DOM document.
+ */
+public class XmlUtils {
+
+    private static DocumentBuilderFactory domBuilderFactory = null;
+    private static SAXParserFactory saxParserFactory = null;
+
+    /**
+     * Creates a DocumentBuilder from the lazily created shared factory.
+     * Synchronized because DocumentBuilderFactory implementations are not
+     * guaranteed to be thread safe.
+     */
+    private static synchronized DocumentBuilder newDocumentBuilder ()
+            throws ParserConfigurationException {
+        if (domBuilderFactory == null)
+            domBuilderFactory = javax.xml.parsers.DocumentBuilderFactory.newInstance();
+        return domBuilderFactory.newDocumentBuilder();
+    }
+
+    /**
+     * Parses XML into a normalized DOM document.
+     *
+     * @param obj a String holding either a URL or the XML text itself,
+     *            an InputStream or a Reader
+     * @return the parsed document
+     * @throws IllegalArgumentException if obj is null or of any other type
+     */
+    public static Document parseXml (Object obj)
+            throws SAXException, IOException, ParserConfigurationException {
+        DocumentBuilder parser = newDocumentBuilder ();
+        Document doc = null;
+        if (obj instanceof String) try {
+            // first try to interpret string as URL
+            new URL (obj.toString ());
+            doc = parser.parse (obj.toString());
+        } catch (MalformedURLException nourl) {
+            // if not a URL, maybe it is the XML itself
+            doc = parser.parse (new InputSource (new StringReader (obj.toString())));
+        } else if (obj instanceof InputStream) {
+            doc = parser.parse (new InputSource ((InputStream) obj));
+        } else if (obj instanceof Reader) {
+            doc = parser.parse (new InputSource ((Reader) obj));
+        } else {
+            // doc would stay null here and doc.normalize() would throw a bare
+            // NullPointerException; report the offending argument instead
+            throw new IllegalArgumentException ("Can't parse XML from "+
+                (obj == null ? "null" : obj.getClass().getName()));
+        }
+        doc.normalize();
+        return doc;
+    }
+
+
+    /**
+     * Parses HTML into a DOM document using HtmlParser.
+     *
+     * @param obj a String holding either a URL or the HTML text itself,
+     *            an InputStream or a Reader; for anything else nothing is
+     *            parsed and the parser's document is returned as is
+     * @throws IOException if the HTML DOM classes can't be loaded or
+     *            reading the input fails
+     */
+    public static Document parseHtml (Object obj)
+            throws SAXException, IOException, ParserConfigurationException {
+        try {
+            Class.forName ("org.apache.html.dom.HTMLBuilder");
+        } catch (Throwable notfound) {
+            throw new IOException ("Couldn't load nekohtml/Xerces HTML parser: "+notfound);
+        }
+        Document doc = null;
+        HtmlParser parser = new HtmlParser ();
+        if (obj instanceof String) try {
+            // first try to interpret string as URL
+            URL url = new URL (obj.toString ());
+            // we opened this stream ourselves, so we have to close it again
+            InputStream in = url.openStream();
+            try {
+                // NOTE(review): decodes with the platform default charset
+                parser.parse (new InputStreamReader (in));
+            } finally {
+                in.close ();
+            }
+        } catch (MalformedURLException nourl) {
+            // if not a URL, maybe it is the HTML itself
+            parser.parse (new StringReader (obj.toString()));
+        } else if (obj instanceof InputStream) {
+            parser.parse (new InputStreamReader ((InputStream) obj));
+        } else if (obj instanceof Reader) {
+            parser.parse ((Reader) obj);
+        }
+        doc = parser.getDocument ();
+        return doc;
+    }
+
+
+}