Search in sources :

Example 1 with NekoHtmlDocumentHandler

Use of gate.html.NekoHtmlDocumentHandler in the project gate-core by GateNLP.

The method unpackMarkup of the class NekoHtmlDocumentFormat.

/**
 * Unpacks the markup in the document, converting markup from the native
 * (HTML) format into annotations in GATE format.
 *
 * <p>The input is chosen as follows:
 * <ul>
 * <li>if {@code hasContentButNoValidUrl(doc)} is true, the document's
 * content is parsed from a string;</li>
 * <li>otherwise, if the document is a {@link TextualDocument}, the source
 * URL is opened and read using the document's own encoding (gunzipping
 * the stream when the connection reports a {@code gzip} content
 * encoding), and the parser is told to ignore any charset specified
 * inside the page;</li>
 * <li>otherwise the parser is handed the source URL and left to decide
 * the encoding itself.</li>
 * </ul>
 *
 * <p>If the document was created from a String, it is advisable to set
 * the document's source URL to <b>null</b> so that the content is parsed
 * rather than a URL.
 *
 * <p>Any failure other than an {@link IOException} sets the
 * {@code "parsingError"} feature on the document to {@code Boolean.TRUE}.
 * It is rethrown as a {@link DocumentFormatException} only when the
 * document feature {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is
 * {@code true}; otherwise a warning and the stack trace are printed and
 * the document is left unparsed.
 *
 * @param doc The GATE document to parse. Must not be null, and must have
 *          either a source URL or content.
 * @param repInfo repositioning information passed on to the document
 *          handler.
 * @param ampCodingInfo the object with ampersand coding positions,
 *          passed on to the document handler.
 * @throws DocumentFormatException if {@code doc} is null or has neither
 *           URL nor content, if an I/O error occurs, or if parsing fails
 *           and the document is configured to throw on format errors.
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Forward status messages from the handler to this format's listeners
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // This is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    NekoHtmlDocumentHandler handler = null;
    // Stream opened by this method on the source URL (if any); tracked
    // outside the try block so it can always be released in finally.
    InputStream uStream = null;
    try {
        org.cyberneko.html.HTMLConfiguration parser = new HTMLConfiguration();
        // convert element and attribute names to lower case
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        // make parser augment infoset with location information
        parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);
        // Create a new document handler
        handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
        // Register a status listener with it
        handler.addStatusListener(statusListener);
        // set repositioning object
        handler.setRepositioningInfo(repInfo);
        // set the object with ampersand coding positions
        handler.setAmpCodingInfo(ampCodingInfo);
        // construct the list of offsets for each line of the document
        // NOTE(review): this dereferences doc.getContent() even when only
        // the source URL is set; a null content ends up in the generic
        // catch below - confirm callers always populate the content first.
        int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
        handler.setLineOffsets(lineOffsets);
        // set the handlers
        parser.setDocumentHandler(handler);
        parser.setErrorHandler(handler);
        // Parse the document with the appropriate encoding
        XMLInputSource is;
        if (docHasContentButNoValidURL) {
            // no URL, so parse from string
            is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
        } else if (doc instanceof TextualDocument) {
            // textual document - load with user specified encoding
            String docEncoding = ((TextualDocument) doc).getEncoding();
            // no BOM stripping is performed on this stream
            URLConnection conn = doc.getSourceUrl().openConnection();
            uStream = conn.getInputStream();
            if ("gzip".equals(conn.getContentEncoding())) {
                uStream = new GZIPInputStream(uStream);
            }
            Reader docReader = new InputStreamReader(uStream, docEncoding);
            is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString(), docReader, docEncoding);
            // since we control the encoding, tell the parser to ignore any
            // meta http-equiv hints
            parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
        } else {
            // let the parser decide the encoding
            is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString());
        }
        /* The following line can forward an
         * ArrayIndexOutOfBoundsException from
         * org.cyberneko.html.HTMLConfiguration.parse; it is handled by
         * the generic catch block below.    */
        parser.parse(is);
        ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
    } catch (IOException e) {
        /* Handle IOException specially. Plain concatenation is used so that
         * a null source URL cannot raise a NullPointerException here and
         * hide the original I/O failure.      */
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } catch (Exception e) {
        /* Handle XNIException and ArrayIndexOutOfBoundsException:
         * flag the parsing error and keep going.     */
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // the document asks for format errors to be propagated
            throw new DocumentFormatException(e);
        } else {
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        }
    } finally {
        if (handler != null) {
            handler.removeStatusListener(statusListener);
        }
        if (uStream != null) {
            // Release the URL stream on every path, including failures
            // before or during parsing.
            try {
                uStream.close();
            } catch (IOException ignored) {
                // best-effort cleanup: a failure to close must not mask
                // the outcome of the parse
            }
        }
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) XMLInputSource(org.apache.xerces.xni.parser.XMLInputSource) GZIPInputStream(java.util.zip.GZIPInputStream) InputStream(java.io.InputStream) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) IOException(java.io.IOException) URLConnection(java.net.URLConnection) ResourceInstantiationException(gate.creole.ResourceInstantiationException) IOException(java.io.IOException) DocumentFormatException(gate.util.DocumentFormatException) DocumentFormatException(gate.util.DocumentFormatException) NekoHtmlDocumentHandler(gate.html.NekoHtmlDocumentHandler) GZIPInputStream(java.util.zip.GZIPInputStream) TextualDocument(gate.TextualDocument) StringReader(java.io.StringReader) StatusListener(gate.event.StatusListener)

Aggregations

TextualDocument (gate.TextualDocument)1 ResourceInstantiationException (gate.creole.ResourceInstantiationException)1 StatusListener (gate.event.StatusListener)1 NekoHtmlDocumentHandler (gate.html.NekoHtmlDocumentHandler)1 DocumentFormatException (gate.util.DocumentFormatException)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 InputStreamReader (java.io.InputStreamReader)1 Reader (java.io.Reader)1 StringReader (java.io.StringReader)1 URLConnection (java.net.URLConnection)1 GZIPInputStream (java.util.zip.GZIPInputStream)1 XMLInputSource (org.apache.xerces.xni.parser.XMLInputSource)1 HTMLConfiguration (org.cyberneko.html.HTMLConfiguration)1