mirror of git://gcc.gnu.org/git/gcc.git
				
				
				
			
		
			
				
	
	
		
			265 lines
		
	
	
		
			7.5 KiB
		
	
	
	
		
			Java
		
	
	
	
			
		
		
	
	
			265 lines
		
	
	
		
			7.5 KiB
		
	
	
	
		
			Java
		
	
	
	
/* DomHTMLParser.java --
 | 
						|
   Copyright (C) 2005 Free Software Foundation, Inc.
 | 
						|
 | 
						|
This file is part of GNU Classpath.
 | 
						|
 | 
						|
GNU Classpath is free software; you can redistribute it and/or modify
 | 
						|
it under the terms of the GNU General Public License as published by
 | 
						|
the Free Software Foundation; either version 2, or (at your option)
 | 
						|
any later version.
 | 
						|
 | 
						|
GNU Classpath is distributed in the hope that it will be useful, but
 | 
						|
WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | 
						|
General Public License for more details.
 | 
						|
 | 
						|
You should have received a copy of the GNU General Public License
 | 
						|
along with GNU Classpath; see the file COPYING.  If not, write to the
 | 
						|
Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 | 
						|
02110-1301 USA.
 | 
						|
 | 
						|
Linking this library statically or dynamically with other modules is
 | 
						|
making a combined work based on this library.  Thus, the terms and
 | 
						|
conditions of the GNU General Public License cover the whole
 | 
						|
combination.
 | 
						|
 | 
						|
As a special exception, the copyright holders of this library give you
 | 
						|
permission to link this library with independent modules to produce an
 | 
						|
executable, regardless of the license terms of these independent
 | 
						|
modules, and to copy and distribute the resulting executable under
 | 
						|
terms of your choice, provided that you also meet, for each linked
 | 
						|
independent module, the terms and conditions of the license of that
 | 
						|
module.  An independent module is a module which is not derived from
 | 
						|
or based on this library.  If you modify this library, you may extend
 | 
						|
this exception to your version of the library, but you are not
 | 
						|
obligated to do so.  If you do not wish to do so, delete this
 | 
						|
exception statement from your version. */
 | 
						|
 | 
						|
 | 
						|
package gnu.xml.dom.html2;
 | 
						|
 | 
						|
import java.io.IOException;
 | 
						|
import java.io.Reader;
 | 
						|
 | 
						|
import java.util.Enumeration;
 | 
						|
import java.util.Iterator;
 | 
						|
import java.util.LinkedList;
 | 
						|
 | 
						|
import javax.swing.text.AttributeSet;
 | 
						|
import javax.swing.text.html.HTML;
 | 
						|
import javax.swing.text.html.parser.DTD;
 | 
						|
import javax.swing.text.html.parser.TagElement;
 | 
						|
 | 
						|
import org.w3c.dom.NamedNodeMap;
 | 
						|
import org.w3c.dom.Node;
 | 
						|
import org.w3c.dom.html2.HTMLDocument;
 | 
						|
 | 
						|
/**
 | 
						|
 * This parser reads HTML from the given stream and stores into
 | 
						|
 * {@link HTMLDocument}. The HTML tag becomes the {@link Node}.
 | 
						|
 * The tag attributes become the node attributes. The text inside
 | 
						|
 * HTML tag is inserted as one or several text nodes. The nested
 | 
						|
 * HTML tags are inserted as child nodes.
 | 
						|
 *
 | 
						|
 * If the strict tree structure, closing the tag means closing all
 | 
						|
 * nested tags. To work around this, this parser closes the nested
 | 
						|
 * tags and immediately reopens them after the closed tag.
 | 
						|
 * In this way, <code><b><i>c</b>d</code>
 | 
						|
 * is parsed as <code><b><i>c</i></b><i>d</code> .
 | 
						|
 *
 | 
						|
 * @author Audrius Meskauskas (AudriusA@Bioinformatics.org)
 | 
						|
 */
 | 
						|
public class DomHTMLParser
 | 
						|
  extends gnu.javax.swing.text.html.parser.support.Parser
 | 
						|
{
 | 
						|
  /**
 | 
						|
   * The target where HTML document will be inserted.
 | 
						|
   */
 | 
						|
  protected DomHTMLDocument document;
 | 
						|
 | 
						|
  /**
 | 
						|
   * The subsequently created new nodes will be inserted as the
 | 
						|
   * childs of this cursor.
 | 
						|
   */
 | 
						|
  protected Node cursor;
 | 
						|
 | 
						|
  /**
 | 
						|
   * Create parser using the given DTD.
 | 
						|
   *
 | 
						|
   * @param dtd the DTD (for example,
 | 
						|
   * {@link gnu.javax.swing.text.html.parser.HTML_401F}).
 | 
						|
   */
 | 
						|
  public DomHTMLParser(DTD dtd)
 | 
						|
  {
 | 
						|
    super(dtd);
 | 
						|
  }
 | 
						|
 | 
						|
  /**
 | 
						|
   * Parse SGML insertion ( <! ... > ).
 | 
						|
   * Currently just treats it as comment.
 | 
						|
   */
 | 
						|
  public boolean parseMarkupDeclarations(StringBuffer strBuff)
 | 
						|
                                  throws java.io.IOException
 | 
						|
  {
 | 
						|
    Node c = document.createComment(strBuff.toString());
 | 
						|
    cursor.appendChild(c);
 | 
						|
    return false;
 | 
						|
  }
 | 
						|
 | 
						|
  /**
 | 
						|
   * Read the document, present in the given stream, and
 | 
						|
   * return the corresponding {@link HTMLDocument}.
 | 
						|
   *
 | 
						|
   * @param input a stream to read from.
 | 
						|
   * @return a document, reflecting the structure of the provided HTML
 | 
						|
   * text.
 | 
						|
   *
 | 
						|
   * @throws IOException if the reader throws one.
 | 
						|
   */
 | 
						|
  public HTMLDocument parseDocument(Reader input)
 | 
						|
                    throws IOException
 | 
						|
  {
 | 
						|
    try
 | 
						|
      {
 | 
						|
        document = new DomHTMLDocument();
 | 
						|
        document.setCheckWellformedness(false);
 | 
						|
        document.setCheckingCharacters(false);
 | 
						|
 | 
						|
        cursor = document;
 | 
						|
 | 
						|
        parse(input);
 | 
						|
 | 
						|
        DomHTMLDocument h = document;
 | 
						|
        document = null;
 | 
						|
        return h;
 | 
						|
      }
 | 
						|
    catch (Exception ex)
 | 
						|
      {
 | 
						|
        ex.printStackTrace();
 | 
						|
        throw new IOException("Exception: " + ex.getMessage());
 | 
						|
      }
 | 
						|
  }
 | 
						|
 | 
						|
  /**
 | 
						|
   * Create a new node.
 | 
						|
   * @param name the name of node, case insensitive.
 | 
						|
   * @return the created node.
 | 
						|
   */
 | 
						|
  protected Node createNode(String name)
 | 
						|
  {
 | 
						|
    Node new_node = document.createElement(name.toLowerCase());
 | 
						|
    AttributeSet hatts = getAttributes();
 | 
						|
    NamedNodeMap natts = new_node.getAttributes();
 | 
						|
 | 
						|
    Enumeration enumeration = hatts.getAttributeNames();
 | 
						|
    Object key;
 | 
						|
    Node attribute;
 | 
						|
 | 
						|
    while (hatts != null)
 | 
						|
      {
 | 
						|
        while (enumeration.hasMoreElements())
 | 
						|
          {
 | 
						|
            key = enumeration.nextElement();
 | 
						|
            attribute = document.createAttribute(key.toString());
 | 
						|
            attribute.setNodeValue(hatts.getAttribute(key).toString());
 | 
						|
            natts.setNamedItem(attribute);
 | 
						|
          }
 | 
						|
 | 
						|
        // The default values are stored in a parent node.
 | 
						|
        hatts = hatts.getResolveParent();
 | 
						|
      }
 | 
						|
 | 
						|
    return new_node;
 | 
						|
  }
 | 
						|
 | 
						|
  /**
 | 
						|
   * Handle comment by inserting the comment node.
 | 
						|
   * @param text the comment text.
 | 
						|
   */
 | 
						|
  protected void handleComment(char[] text)
 | 
						|
  {
 | 
						|
    Node c = document.createComment(new String(text));
 | 
						|
    cursor.appendChild(c);
 | 
						|
  }
 | 
						|
 | 
						|
  /**
 | 
						|
   * Handle the tag with no content.
 | 
						|
   * @param tag the tag to handle.
 | 
						|
   */
 | 
						|
  protected void handleEmptyTag(TagElement tag)
 | 
						|
  {
 | 
						|
    String name = tag.getHTMLTag().toString();
 | 
						|
 | 
						|
    if (name.equalsIgnoreCase("#pcdata"))
 | 
						|
      return;
 | 
						|
 | 
						|
    Node c = createNode(name);
 | 
						|
    cursor.appendChild(c);
 | 
						|
  }
 | 
						|
 | 
						|
  /**
 | 
						|
   * Close the given tag. Close and reopen all nested tags.
 | 
						|
   * @param tag the tag to close.
 | 
						|
   */
 | 
						|
  protected void handleEndTag(TagElement tag)
 | 
						|
  {
 | 
						|
    String name = tag.getHTMLTag().toString();
 | 
						|
    String nname = cursor.getNodeName();
 | 
						|
 | 
						|
    // Closing the current tag.
 | 
						|
    if (nname != null && nname.equalsIgnoreCase(name))
 | 
						|
      {
 | 
						|
        cursor = cursor.getParentNode();
 | 
						|
      }
 | 
						|
    else
 | 
						|
      {
 | 
						|
        Node nCursor = cursor.getParentNode();
 | 
						|
 | 
						|
        // Remember the opened nodes.
 | 
						|
        LinkedList open = new LinkedList();
 | 
						|
        Node close = cursor;
 | 
						|
        while (close != null && !close.getNodeName().equalsIgnoreCase(name))
 | 
						|
          {
 | 
						|
            if (close != document)
 | 
						|
              open.addFirst(close);
 | 
						|
            close = close.getParentNode();
 | 
						|
          }
 | 
						|
        if (close == null)
 | 
						|
          cursor = document;
 | 
						|
        else
 | 
						|
          cursor = close.getParentNode();
 | 
						|
 | 
						|
        // Insert the copies of the opened nodes.
 | 
						|
        Iterator iter = open.iterator();
 | 
						|
        while (iter.hasNext())
 | 
						|
          {
 | 
						|
            Node item = (Node) iter.next();
 | 
						|
            cursor.appendChild(item);
 | 
						|
            cursor = item;
 | 
						|
          }
 | 
						|
      }
 | 
						|
  }
 | 
						|
 | 
						|
  /**
 | 
						|
   * Handle the start tag by inserting the HTML element.
 | 
						|
   * @param tag the tag to handle.
 | 
						|
   */
 | 
						|
  protected void handleStartTag(TagElement tag)
 | 
						|
  {
 | 
						|
    HTML.Tag h = tag.getHTMLTag();
 | 
						|
    Node c = createNode(h.toString());
 | 
						|
    cursor.appendChild(c);
 | 
						|
    cursor = c;
 | 
						|
  }
 | 
						|
 | 
						|
  /**
 | 
						|
   * Handle text by inserting the text node.
 | 
						|
   * @param text the text to insert.
 | 
						|
   */
 | 
						|
  protected void handleText(char[] text)
 | 
						|
  {
 | 
						|
    Node c = document.createTextNode(text, 0, text.length);
 | 
						|
    cursor.appendChild(c);
 | 
						|
  }
 | 
						|
}
 |