* Drop hacked-together HtmlParser class, move to Tagsoup for HTML parsing.

* Move remaining HTML parsing code into XmlUtils.
* Clean up XmlUtils code.
2007-02-21 14:26:36 +00:00 · 2007-02-21 14:26:36 +00:00 · 8a85d6d5cf
commit 8a85d6d5cf
parent 0667f13290
2 changed files with 27 additions and 325 deletions
--- a/src/helma/util/HtmlParser.java
+++ b/src/helma/util/HtmlParser.java
@ -1,307 +0,0 @@
 /*
 * Helma License Notice
 *
 * The contents of this file are subject to the Helma License
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. A copy of the License is available at
 * http://adele.helma.org/download/helma/license.txt
 *
 * Copyright 1998-2003 Helma Software. All Rights Reserved.
 *
 * $RCSfile$
 * $Author$
 * $Revision$
 * $Date$
 */
 package helma.util;
 import org.apache.html.dom.*;
 import org.w3c.dom.html.HTMLDocument;
 import org.xml.sax.SAXException;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.EmptyStackException;
 import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Stack;
 import javax.swing.text.SimpleAttributeSet;
 import javax.swing.text.html.parser.*;
 /**
  * HTML parser based on the Swing {@link Parser} (using the "html32" DTD)
  * that forwards the parse events it receives to an
  * {@code org.apache.html.dom.HTMLBuilder} in order to build an
  * {@link HTMLDocument}.
  *
  * The parser keeps a stack of the names of currently open elements so that
  * it can emit implied end tags (for TD, TR, LI, DT/DD, OPTION and P) when a
  * new element of the same kind is started.
  */
 public class HtmlParser extends Parser {
    // Sets of (upper case) element names at which closeOpenTags() gives up
    // searching the stack of open elements.
    static final HashSet stopNone = new HashSet();
    static final HashSet stopTable = new HashSet();
    static final HashSet stopList = new HashSet();
    static final HashSet stopDeflist = new HashSet();
    static {
        stopTable.add("TABLE");
        stopList.add("TABLE");
        stopList.add("UL");
        stopList.add("OL");
        stopDeflist.add("TABLE");
        stopDeflist.add("DL");
    }
    // receiver of the SAX-style events generated by this parser
    HTMLBuilder builder;
    // reusable attribute list, refilled for every start/empty tag
    Attributes attributes = new Attributes();
    // upper case names of the elements that are currently open
    Stack stack = new Stack();
    /**
     * Creates a new HtmlParser object using the "html32" DTD and starts
     * a new document on the underlying HTMLBuilder.
     *
     * @throws IOException if the DTD could not be obtained
     */
    public HtmlParser() throws IOException {
        super(DTD.getDTD("html32"));
        // define elements to be treated as container tags, and undefine those
        // to be treated as empty tags.
        dtd.getElement("table");
        dtd.getElement("tr");
        dtd.getElement("td");
        dtd.getElement("span");
        dtd.getElement("div");
        dtd.getElement("font");
        dtd.getElement("b");
        dtd.getElement("i");
        dtd.getElement("a");
        dtd.getElement("blockquote");
        dtd.getElement("em");
        dtd.getElement("ul");
        dtd.getElement("ol");
        dtd.getElement("li");
        dtd.getElement("dl");
        dtd.getElement("dt");
        dtd.getElement("dd");
        dtd.getElement("h1");
        dtd.getElement("h2");
        dtd.getElement("h3");
        dtd.getElement("h4");
        dtd.getElement("h5");
        dtd.getElement("h6");
        dtd.getElement("form");
        dtd.getElement("option");
        dtd.elementHash.remove("meta");
        dtd.elementHash.remove("link");
        dtd.elementHash.remove("base");
        builder = new HTMLBuilder();
        try {
            builder.startDocument();
        } catch (SAXException x) {
            System.err.println("Error in constructor");
        }
    }
    /**
     * Handle Start Tag. Converts the current attributes, closes any open
     * elements that are implicitly ended by this tag, pushes the tag name
     * on the stack of open elements and forwards the start event to the
     * builder.
     *
     * @param tag the tag being started
     */
    protected void handleStartTag(TagElement tag) {
        // System.err.println ("handleStartTag ("+tag.getHTMLTag()+")");
        attributes.convert(getAttributes());
        flushAttributes();
        String tagname = tag.getHTMLTag().toString().toUpperCase();
        // immediately empty A anchor tag
        if ("A".equals(tagname) && (attributes.getValue("href") == null)) {
            try {
                builder.startElement(tagname, attributes);
                builder.endElement(tagname);
                return;
            } catch (SAXException x) {
                // NOTE(review): the exception is ignored and the tag is then
                // handled like any other start tag below - confirm intended.
            }
        }
        if ("TD".equals(tagname)) {
            closeOpenTags("TD", stopTable, 10);
        } else if ("TR".equals(tagname)) {
            closeOpenTags("TR", stopTable, 10);
        } else if ("LI".equals(tagname)) {
            closeOpenTags("LI", stopList, 6);
        } else if ("DT".equals(tagname) || "DD".equals(tagname)) {
            // A new DT or DD implicitly ends a preceding DT or DD. This used
            // to ask for "DL" instead of "DD"; since "DL" is a member of
            // stopDeflist that call could never close anything, leaving an
            // open DD on the stack.
            closeOpenTags("DT", stopDeflist, 6);
            closeOpenTags("DD", stopDeflist, 6);
        } else if ("OPTION".equals(tagname)) {
            closeOpenTags("OPTION", stopNone, 1);
        } else if ("P".equals(tagname)) {
            closeOpenTags("P", stopNone, 1);
        }
        stack.push(tagname);
        try {
            builder.startElement(tagname, attributes);
        } catch (SAXException x) {
            System.err.println("Error in handleStartTag");
        }
    }
    /**
     * Handle End Tag. Removes the tag name from the stack of open elements
     * if it is the topmost entry, and forwards the end event to the builder
     * in any case.
     *
     * @param tag the tag being ended
     */
    protected void handleEndTag(TagElement tag) {
        // System.err.println ("handleEndTag ("+tag.getHTMLTag()+")");
        String tagname = tag.getHTMLTag().toString().toUpperCase();
        try {
            if (tagname.equals(stack.peek())) {
                stack.pop();
            }
        } catch (EmptyStackException es) {
            // no open element on the stack, nothing to pop
        }
        try {
            builder.endElement(tagname);
        } catch (SAXException x) {
            System.err.println("Error in handleEndTag: " + x);
        }
    }
    /**
     * Handle Empty Tag. Forwards a start event immediately followed by an
     * end event to the builder; the stack of open elements is not touched.
     *
     * @param tag the empty tag
     */
    protected void handleEmptyTag(TagElement tag) {
        // System.err.println ("handleEmptyTag ("+tag.getHTMLTag()+")");
        attributes.convert(getAttributes());
        flushAttributes();
        String tagname = tag.getHTMLTag().toString().toUpperCase();
        try {
            builder.startElement(tagname, attributes);
            builder.endElement(tagname);
        } catch (SAXException x) {
            System.err.println("Error in handleEmptyTag: " + x);
        }
    }
    /**
     * Handle Text. Forwards the complete character array to the builder.
     *
     * @param data the text characters
     */
    protected void handleText(char[] data) {
        // System.err.println ("handleText ("+new String (data)+")");
        try {
            builder.characters(data, 0, data.length);
        } catch (SAXException x) {
            System.err.println("Error in handleText");
        }
    }
    /*
     * Error handling. Parse errors are ignored.
     */
    protected void handleError(int ln, String errorMsg) {
        // System.err.println ("handleError ("+ln+": "+errorMsg+")");
    }
    /**
     *  Handle comment. Comments are dropped and not passed to the builder.
     *
     * @param data the comment characters
     */
    protected void handleComment(char[] data) {
        // System.err.println ("handleComment ("+new String (data)+")");
        /* try {
           builder.characters (data, 0, data.length);
           } catch (SAXException x) {
               System.err.println ("Error in handleComment");
           }*/
    }
    /**
     * Ends the document on the builder and returns the document it built.
     *
     * @return the HTML document produced by the builder
     */
    public HTMLDocument getDocument() {
        try {
            builder.endDocument();
        } catch (SAXException x) {
            // ignored, the builder's document is returned regardless
        }
        return builder.getHTMLDocument();
    }
    /**
     * Closes open elements down to and including the nearest open element
     * named <code>until</code>. The stack of open elements is searched from
     * the top, looking at no more than <code>maxdepth</code> entries. If an
     * element contained in <code>stoppers</code> is met before
     * <code>until</code> is found, nothing is closed.
     *
     * @param until upper case name of the element to close
     * @param stoppers upper case element names that abort the search
     * @param maxdepth maximum number of stack entries to inspect
     */
    private void closeOpenTags(String until, HashSet stoppers, int maxdepth) {
        int l = stack.size();
        int stop = Math.max(0, l - maxdepth);
        int found = -1;
        for (int i = l - 1; i >= stop; i--) {
            Object o = stack.elementAt(i);
            // stoppers are checked first, so a stopper is never closed
            if (stoppers.contains(o)) {
                return;
            }
            if (until.equals(o)) {
                found = i;
                break;
            }
        }
        if (found > -1) {
            // pop and end everything from the top of the stack down to
            // the element that was found
            for (int i = l - 1; i >= found; i--) {
                try {
                    String t = (String) stack.pop();
                    builder.endElement(t);
                } catch (Exception x) {
                    // ignored, continue closing the remaining elements
                }
            }
        }
    }
    /**
     * Implementation of the SAX AttributeList interface that is filled from
     * a Swing SimpleAttributeSet. Attribute names are converted to lower
     * case, and every attribute is reported with type "CDATA".
     */
    class Attributes implements org.xml.sax.AttributeList {
        // lower case attribute name -> value, used for lookup by name
        HashMap map = new HashMap();
        // attribute names and values in iteration order, used for lookup by index
        ArrayList names = new ArrayList();
        ArrayList values = new ArrayList();
        public int getLength() {
            return names.size();
        }
        public String getName(int i) {
            return (String) names.get(i);
        }
        public String getType(int i) {
            return "CDATA";
        }
        public String getType(String name) {
            return "CDATA";
        }
        public String getValue(int i) {
            return (String) values.get(i);
        }
        public String getValue(String name) {
            return (String) map.get(name);
        }
        /**
         * Replaces the current content of this list with the attributes
         * of the given attribute set.
         *
         * @param attset the attribute set to copy names and values from
         */
        public void convert(SimpleAttributeSet attset) {
            map.clear();
            names.clear();
            values.clear();
            for (Enumeration e = attset.getAttributeNames(); e.hasMoreElements();) {
                Object name = e.nextElement();
                Object value = attset.getAttribute(name).toString();
                name = name.toString().toLowerCase();
                map.put(name, value);
                names.add(name);
                values.add(value);
            }
        }
    }
 }
--- a/src/helma/util/XmlUtils.java
+++ b/src/helma/util/XmlUtils.java
@ -17,8 +17,13 @@
 package helma.util;
 import org.w3c.dom.Document;
 import org.w3c.dom.html.HTMLDocument;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.XMLReaderAdapter;
 import org.ccil.cowan.tagsoup.Parser;
 import org.apache.html.dom.HTMLBuilder;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@ -55,7 +60,7 @@ public class XmlUtils {
        }
        DocumentBuilder parser = domBuilderFactory.newDocumentBuilder();
-        Document doc = null;
+        Document doc;
        if (obj instanceof String) {
            try {
@ -71,10 +76,11 @@ public class XmlUtils {
            doc = parser.parse(new InputSource((InputStream) obj));
        } else if (obj instanceof Reader) {
            doc = parser.parse(new InputSource((Reader) obj));
        } else {
            throw new RuntimeException("Unrecognized argument to parseXml: " + obj);
        }
        doc.normalize();
        return doc;
    }
@ -87,36 +93,39 @@ public class XmlUtils {
     *
     * @throws IOException ...
     */
-    public static Document parseHtml(Object obj)
+    public static HTMLDocument parseHtml(Object obj)
-                              throws IOException {
+                              throws IOException, SAXException {
        try {
-            Class.forName("org.apache.html.dom.HTMLBuilder");
+            Class.forName("org.apache.html.dom.HTMLDocumentImpl");
        } catch (Throwable notfound) {
-            throw new IOException("Couldn't load nekohtml/Xerces HTML parser: " +
+            throw new RuntimeException("Couldn't load Xerces HTML DOM classes. " +
-                                  notfound);
+                "Make sure you have xercesImpl.jar and xml-apis.jar in your classpath.");
        }
        Document doc = null;
        HtmlParser parser = new HtmlParser();
        if (obj instanceof String) {
            try {
                // first try to interpret string as URL
                URL url = new URL(obj.toString());
-
+                return getHtmlDocument(new InputStreamReader(url.openStream()));
                parser.parse(new InputStreamReader(url.openStream()));
            } catch (MalformedURLException nourl) {
                // if not a URL, maybe it is the XML itself
-                parser.parse(new StringReader(obj.toString()));
+                return getHtmlDocument(new StringReader(obj.toString()));
            }
        } else if (obj instanceof InputStream) {
-            parser.parse(new InputStreamReader((InputStream) obj));
+            return getHtmlDocument(new InputStreamReader((InputStream) obj));
        } else if (obj instanceof Reader) {
-            parser.parse((Reader) obj);
+            return getHtmlDocument((Reader) obj);
        } else {
            throw new RuntimeException("Unrecognized argument to parseHtml: " + obj);
        }
    }
-        doc = parser.getDocument();
+    private static HTMLDocument getHtmlDocument(Reader reader)
-
+            throws IOException, SAXException {
-        return doc;
+        XMLReaderAdapter parser = new XMLReaderAdapter(new Parser());
        HTMLBuilder builder = new HTMLBuilder();
        parser.setDocumentHandler(builder);
        parser.parse(new InputSource(reader));
        return builder.getHTMLDocument();
    }
 }