diff --git a/src/helma/util/HtmlParser.java b/src/helma/util/HtmlParser.java deleted file mode 100644 index eeb58e13..00000000 --- a/src/helma/util/HtmlParser.java +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Helma License Notice - * - * The contents of this file are subject to the Helma License - * Version 2.0 (the "License"). You may not use this file except in - * compliance with the License. A copy of the License is available at - * http://adele.helma.org/download/helma/license.txt - * - * Copyright 1998-2003 Helma Software. All Rights Reserved. - * - * $RCSfile$ - * $Author$ - * $Revision$ - * $Date$ - */ - -package helma.util; - -import org.apache.html.dom.*; -import org.w3c.dom.html.HTMLDocument; -import org.xml.sax.SAXException; -import java.io.IOException; -import java.util.ArrayList; -import java.util.EmptyStackException; -import java.util.Enumeration; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Stack; -import javax.swing.text.SimpleAttributeSet; -import javax.swing.text.html.parser.*; - -/** - * - */ -public class HtmlParser extends Parser { - static final HashSet stopNone = new HashSet(); - static final HashSet stopTable = new HashSet(); - static final HashSet stopList = new HashSet(); - static final HashSet stopDeflist = new HashSet(); - - static { - stopTable.add("TABLE"); - stopList.add("TABLE"); - stopList.add("UL"); - stopList.add("OL"); - stopDeflist.add("TABLE"); - stopDeflist.add("DL"); - } - - HTMLBuilder builder; - Attributes attributes = new Attributes(); - Stack stack = new Stack(); - - /** - * Creates a new HtmlParser object. - * - * @throws IOException ... - */ - public HtmlParser() throws IOException { - super(DTD.getDTD("html32")); - - // define elements to be treated as container tags, and undefine those - // to be treated as empty tags. - dtd.getElement("table"); - dtd.getElement("tr"); - dtd.getElement("td"); - dtd.getElement("span"); - dtd.getElement("div"); - dtd.getElement("font"); - dtd.getElement("b"); - dtd.getElement("i"); - dtd.getElement("a"); - dtd.getElement("blockquote"); - dtd.getElement("em"); - dtd.getElement("ul"); - dtd.getElement("ol"); - dtd.getElement("li"); - dtd.getElement("dl"); - dtd.getElement("dt"); - dtd.getElement("dd"); - dtd.getElement("h1"); - dtd.getElement("h2"); - dtd.getElement("h3"); - dtd.getElement("h4"); - dtd.getElement("h5"); - dtd.getElement("h6"); - dtd.getElement("form"); - dtd.getElement("option"); - dtd.elementHash.remove("meta"); - dtd.elementHash.remove("link"); - dtd.elementHash.remove("base"); - builder = new HTMLBuilder(); - - try { - builder.startDocument(); - } catch (SAXException x) { - System.err.println("Error in constructor"); - } - } - - /** - * Handle Start Tag. - */ - protected void handleStartTag(TagElement tag) { - // System.err.println ("handleStartTag ("+tag.getHTMLTag()+")"); - attributes.convert(getAttributes()); - flushAttributes(); - - String tagname = tag.getHTMLTag().toString().toUpperCase(); - - // immediately empty A anchor tag - if ("A".equals(tagname) && (attributes.getValue("href") == null)) { - try { - builder.startElement(tagname, attributes); - builder.endElement(tagname); - - return; - } catch (SAXException x) { - } - } - - if ("TD".equals(tagname)) { - closeOpenTags("TD", stopTable, 10); - } else if ("TR".equals(tagname)) { - closeOpenTags("TR", stopTable, 10); - } else if ("LI".equals(tagname)) { - closeOpenTags("LI", stopList, 6); - } else if ("DT".equals(tagname) || "DD".equals(tagname)) { - closeOpenTags("DT", stopDeflist, 6); - closeOpenTags("DL", stopDeflist, 6); - } else if ("OPTION".equals(tagname)) { - closeOpenTags("OPTION", stopNone, 1); - } else if ("P".equals(tagname)) { - closeOpenTags("P", stopNone, 1); - } - - stack.push(tagname); - - try { - builder.startElement(tagname, attributes); - } catch (SAXException x) { - System.err.println("Error in handleStartTag"); - } - } - - /** - * Handle End Tag. - */ - protected void handleEndTag(TagElement tag) { - // System.err.println ("handleEndTag ("+tag.getHTMLTag()+")"); - String tagname = tag.getHTMLTag().toString().toUpperCase(); - - try { - if (tagname.equals(stack.peek())) { - stack.pop(); - } - } catch (EmptyStackException es) { - } - - try { - builder.endElement(tagname); - } catch (SAXException x) { - System.err.println("Error in handleEndTag: " + x); - } - } - - /** - * Handle Empty Tag. - */ - protected void handleEmptyTag(TagElement tag) { - // System.err.println ("handleEmptyTag ("+tag.getHTMLTag()+")"); - attributes.convert(getAttributes()); - flushAttributes(); - - String tagname = tag.getHTMLTag().toString().toUpperCase(); - - try { - builder.startElement(tagname, attributes); - builder.endElement(tagname); - } catch (SAXException x) { - System.err.println("Error in handleEmptyTag: " + x); - } - } - - /** - * Handle Text. - */ - protected void handleText(char[] data) { - // System.err.println ("handleText ("+new String (data)+")"); - try { - builder.characters(data, 0, data.length); - } catch (SAXException x) { - System.err.println("Error in handleText"); - } - } - - /* - * Error handling. - */ - protected void handleError(int ln, String errorMsg) { - // System.err.println ("handleError ("+ln+": "+errorMsg+")"); - } - - /** - * Handle comment. - */ - protected void handleComment(char[] data) { - // System.err.println ("handleComment ("+new String (data)+")"); - - /* try { - builder.characters (data, 0, data.length); - } catch (SAXException x) { - System.err.println ("Error in handleComment"); - }*/ - } - - /** - * - * - * @return ... - */ - public HTMLDocument getDocument() { - try { - builder.endDocument(); - } catch (SAXException x) { - } - - return builder.getHTMLDocument(); - } - - private void closeOpenTags(String until, HashSet stoppers, int maxdepth) { - int l = stack.size(); - int stop = Math.max(0, l - maxdepth); - int found = -1; - - for (int i = l - 1; i >= stop; i--) { - Object o = stack.elementAt(i); - - if (stoppers.contains(o)) { - return; - } - - if (until.equals(o)) { - found = i; - - break; - } - } - - if (found > -1) { - for (int i = l - 1; i >= found; i--) { - try { - String t = (String) stack.pop(); - - builder.endElement(t); - } catch (Exception x) { - } - } - } - } - - class Attributes implements org.xml.sax.AttributeList { - HashMap map = new HashMap(); - ArrayList names = new ArrayList(); - ArrayList values = new ArrayList(); - - public int getLength() { - return names.size(); - } - - public String getName(int i) { - return (String) names.get(i); - } - - public String getType(int i) { - return "CDATA"; - } - - public String getType(String name) { - return "CDATA"; - } - - public String getValue(int i) { - return (String) values.get(i); - } - - public String getValue(String name) { - return (String) map.get(name); - } - - public void convert(SimpleAttributeSet attset) { - map.clear(); - names.clear(); - values.clear(); - - for (Enumeration e = attset.getAttributeNames(); e.hasMoreElements();) { - Object name = e.nextElement(); - Object value = attset.getAttribute(name).toString(); - - name = name.toString().toLowerCase(); - map.put(name, value); - names.add(name); - values.add(value); - } - } - } -} diff --git a/src/helma/util/XmlUtils.java b/src/helma/util/XmlUtils.java index 055d5dfa..da85e62e 100644 --- a/src/helma/util/XmlUtils.java +++ b/src/helma/util/XmlUtils.java @@ -17,8 +17,13 @@ package helma.util; import org.w3c.dom.Document; +import org.w3c.dom.html.HTMLDocument; import org.xml.sax.InputSource; import org.xml.sax.SAXException; +import org.xml.sax.helpers.XMLReaderAdapter; +import org.ccil.cowan.tagsoup.Parser; +import org.apache.html.dom.HTMLBuilder; + import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; @@ -55,7 +60,7 @@ public class XmlUtils { } DocumentBuilder parser = domBuilderFactory.newDocumentBuilder(); - Document doc = null; + Document doc; if (obj instanceof String) { try { @@ -71,10 +76,11 @@ public class XmlUtils { doc = parser.parse(new InputSource((InputStream) obj)); } else if (obj instanceof Reader) { doc = parser.parse(new InputSource((Reader) obj)); + } else { + throw new RuntimeException("Unrecognized argument to parseXml: " + obj); } doc.normalize(); - return doc; } @@ -87,36 +93,39 @@ public class XmlUtils { * * @throws IOException ... */ - public static Document parseHtml(Object obj) - throws IOException { + public static HTMLDocument parseHtml(Object obj) + throws IOException, SAXException { try { - Class.forName("org.apache.html.dom.HTMLBuilder"); + Class.forName("org.apache.html.dom.HTMLDocumentImpl"); } catch (Throwable notfound) { - throw new IOException("Couldn't load nekohtml/Xerces HTML parser: " + - notfound); + throw new RuntimeException("Couldn't load Xerces HTML DOM classes. " + + "Make sure you have xercesImpl.jar and xml-apis.jar in your classpath."); } - Document doc = null; - HtmlParser parser = new HtmlParser(); - if (obj instanceof String) { try { // first try to interpret string as URL URL url = new URL(obj.toString()); - - parser.parse(new InputStreamReader(url.openStream())); + return getHtmlDocument(new InputStreamReader(url.openStream())); } catch (MalformedURLException nourl) { // if not a URL, maybe it is the XML itself - parser.parse(new StringReader(obj.toString())); + return getHtmlDocument(new StringReader(obj.toString())); } } else if (obj instanceof InputStream) { - parser.parse(new InputStreamReader((InputStream) obj)); + return getHtmlDocument(new InputStreamReader((InputStream) obj)); } else if (obj instanceof Reader) { - parser.parse((Reader) obj); + return getHtmlDocument((Reader) obj); + } else { + throw new RuntimeException("Unrecognized argument to parseHtml: " + obj); } + } - doc = parser.getDocument(); - - return doc; + private static HTMLDocument getHtmlDocument(Reader reader) + throws IOException, SAXException { + XMLReaderAdapter parser = new XMLReaderAdapter(new Parser()); + HTMLBuilder builder = new HTMLBuilder(); + parser.setDocumentHandler(builder); + parser.parse(new InputSource(reader)); + return builder.getHTMLDocument(); } }