Search in sources :

Example 1 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class XmlDocumentFormat method unpackMarkup.

// unpackMarkup
/**
 * Unpack the markup in the document. This converts markup from the
 * native format (e.g. XML) into annotations in GATE format. Uses the
 * markupElementsMap to determine which elements to convert, and what
 * annotation type names to use. If the document was created from a
 * String, then it is recommended to set the doc's sourceUrl to
 * <b>null</b>. So, if the document has a valid URL, then the parser
 * will try to parse the XML document pointed to by the URL. If the URL
 * is not valid, or is null, then the doc's content will be parsed. If
 * the doc's content is not valid XML then the parser might crash.
 * <p>
 * The first 2048 characters of the document content are inspected to
 * decide whether the document is in the GATE XML format; the work is
 * then delegated to {@code unpackGateFormatMarkup} or
 * {@code unpackGeneralXmlMarkup} accordingly. When the document has a
 * source URL but no content, the format cannot be sniffed and the
 * general XML path is used.
 *
 * @param doc The gate document you want to parse. If
 *          <code>doc.getSourceUrl()</code> returns <b>null</b>
 *          then the content of doc will be parsed. Using a URL is
 *          recommended because the parser will report errors correctly
 *          if the XML document is not well formed.
 * @param repInfo repositioning information, handed unchanged to the
 *          general XML unpacker
 * @param ampCodingInfo ampersand coding information, handed unchanged
 *          to the general XML unpacker
 * @throws DocumentFormatException if <code>doc</code> is null, or if it
 *           has neither a source URL nor content
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Create a status listener that forwards to this format's own listeners
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // This is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    // Determine whether we have a GATE format XML document or another
    // kind. The guard above allows a document with a URL but no content,
    // so the content must be null-checked before it is inspected.
    boolean gateFormat = false;
    Object rawContent = doc.getContent();
    if (rawContent != null) {
        String head = rawContent.toString();
        if (head.length() > 2048) {
            // only the beginning of the document is needed for sniffing
            head = head.substring(0, 2048);
        }
        gateFormat = isGateXmlFormat(head);
    }
    if (gateFormat) {
        unpackGateFormatMarkup(doc, statusListener);
    } else {
        unpackGeneralXmlMarkup(doc, repInfo, ampCodingInfo, statusListener);
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) StatusListener(gate.event.StatusListener)

Example 2 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class XmlDocumentFormat method unpackGateFormatMarkup.

/**
 * Unpacks markup in the GATE-specific standoff XML markup format.
 * <p>
 * The input is taken from the document content when the document has
 * content but no valid URL; otherwise it is read from the source URL,
 * using the document's declared encoding for a {@link TextualDocument}
 * and letting the parser determine the encoding in all other cases.
 * <p>
 * If an {@link XMLStreamException} occurs, the document feature
 * <code>parsingError</code> is set to <code>Boolean.TRUE</code>. The
 * exception is rethrown as a {@link DocumentFormatException} only when
 * the document feature
 * {@link GateConstants#THROWEX_FORMAT_PROPERTY_NAME} is true; otherwise
 * a warning and the stack trace are printed and the method returns.
 *
 * @param doc the document to process
 * @param statusListener optional status listener to receive status
 *          messages
 * @throws DocumentFormatException if a fatal error occurs during
 *           parsing, or if an I/O error occurs while reading the input
 */
private void unpackGateFormatMarkup(Document doc, StatusListener statusListener) throws DocumentFormatException {
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    try {
        Reader inputReader = null;
        InputStream inputStream = null;
        XMLStreamReader xsr = null;
        // Everything that acquires a resource happens inside this inner
        // try, so that a failure while opening the input or while looking
        // for the root tag still releases whatever was already opened.
        try {
            if (docHasContentButNoValidURL) {
                inputReader = new StringReader(doc.getContent().toString());
                xsr = getInputFactory().createXMLStreamReader(inputReader);
            } else if (doc instanceof TextualDocument) {
                String encoding = ((TextualDocument) doc).getEncoding();
                // Don't strip BOM on XML.
                // Keep a handle on the raw stream: if the encoding is not
                // supported the reader is never created, and the stream
                // would otherwise be leaked.
                inputStream = doc.getSourceUrl().openStream();
                inputReader = new InputStreamReader(inputStream, encoding);
                // create stream reader with the URL as system ID, to support
                // relative URLs to e.g. DTD or external entities
                xsr = getInputFactory().createXMLStreamReader(doc.getSourceUrl().toExternalForm(), inputReader);
            } else {
                // not a TextualDocument, so let parser determine encoding
                inputStream = doc.getSourceUrl().openStream();
                xsr = getInputFactory().createXMLStreamReader(doc.getSourceUrl().toExternalForm(), inputStream);
            }
            // find the opening GateDocument tag
            xsr.nextTag();
            // parse the document
            DocumentStaxUtils.readGateXmlDocument(xsr, doc, statusListener);
        } finally {
            // Closing an XMLStreamReader does not close the underlying
            // input, so each resource is closed explicitly. The nesting
            // guarantees that a failing close() does not skip the others.
            try {
                if (xsr != null) {
                    xsr.close();
                }
            } finally {
                try {
                    if (inputReader != null) {
                        inputReader.close();
                    }
                } finally {
                    if (inputStream != null) {
                        inputStream.close();
                    }
                }
            }
        }
    } catch (XMLStreamException e) {
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // the document asked for format errors to be propagated
            throw new DocumentFormatException(e);
        } else {
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        }
    } catch (IOException ioe) {
        // string concatenation is null-safe, unlike getSourceUrl().toString()
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), ioe);
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) XMLStreamReader(javax.xml.stream.XMLStreamReader) InputStreamReader(java.io.InputStreamReader) XMLStreamException(javax.xml.stream.XMLStreamException) InputStream(java.io.InputStream) TextualDocument(gate.TextualDocument) StringReader(java.io.StringReader) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) XMLStreamReader(javax.xml.stream.XMLStreamReader) IOException(java.io.IOException)

Example 3 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class TikaFormat method unpackMarkup.

/**
 * Unpacks the markup of a document by running the default Tika parser
 * over the stream opened from the document's source URL, feeding the
 * parse events to an {@link XmlDocumentHandler}.
 *
 * @param doc the GATE document to parse; it must have a source URL
 * @param repInfo repositioning information set on the handler
 * @param ampCodingInfo ampersand coding positions set on the handler
 * @throws DocumentFormatException if <code>doc</code> or its source URL
 *           is null, or if an {@link IOException},
 *           {@link SAXException} or {@link TikaException} occurs
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    boolean nothingToParse = (doc == null) || (doc.getSourceUrl() == null);
    if (nothingToParse) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Relay handler status messages to the listeners of this format
    StatusListener relay = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // fireStatusChanged is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    XmlDocumentHandler contentHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    Metadata parserTips = extractParserTips(doc);
    contentHandler.addStatusListener(relay);
    contentHandler.setRepositioningInfo(repInfo);
    // hand over the object with ampersand coding positions
    contentHandler.setAmpCodingInfo(ampCodingInfo);
    InputStream source = null;
    try {
        Parser parser = new TikaConfig().getParser();
        source = doc.getSourceUrl().openStream();
        parser.parse(source, contentHandler, parserTips, new ParseContext());
        setDocumentFeatures(parserTips, doc);
    } catch (IOException | SAXException | TikaException e) {
        // every failure is reported uniformly as a format exception
        throw new DocumentFormatException(e);
    } finally {
        // closeQuietly tolerates a null stream
        IOUtils.closeQuietly(source);
        contentHandler.removeStatusListener(relay);
    }
    if (!(doc instanceof DocumentImpl)) {
        return;
    }
    ((DocumentImpl) doc).setNextAnnotationId(contentHandler.getCustomObjectsId());
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) InputStream(java.io.InputStream) XmlDocumentHandler(gate.xml.XmlDocumentHandler) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) Parser(org.apache.tika.parser.Parser) SAXException(org.xml.sax.SAXException) DocumentFormatException(gate.util.DocumentFormatException) ParseContext(org.apache.tika.parser.ParseContext) StatusListener(gate.event.StatusListener)

Example 4 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class NekoHtmlDocumentFormat method unpackMarkup.

/**
 * Unpack the markup in the document. This converts markup from the
 * native format into annotations in GATE format. If the document was
 * created from a String, then it is recommended to set the doc's
 * sourceUrl to <b>null</b>. So, if the document has a valid URL,
 * then the parser will try to parse the document pointed to by the
 * URL. If the URL is not valid, or is null, then the doc's content
 * will be parsed. If the doc's content is not valid then the
 * parser might crash.
 * <p>
 * Any exception other than an {@link IOException} raised while parsing
 * sets the document feature <code>parsingError</code> to
 * <code>Boolean.TRUE</code>; it is rethrown as a
 * {@link DocumentFormatException} only when the document feature
 * {@link GateConstants#THROWEX_FORMAT_PROPERTY_NAME} is true, otherwise
 * a warning and the stack trace are printed.
 *
 * @param doc The gate document you want to parse. If
 *          <code>doc.getSourceUrl()</code> returns <b>null</b>
 *          then the content of doc will be parsed. Using a URL is
 *          recommended because the parser will report errors correctly
 *          if the document is not well formed.
 * @param repInfo repositioning information set on the handler
 * @param ampCodingInfo ampersand coding positions set on the handler
 * @throws DocumentFormatException if <code>doc</code> is null, if it
 *           has neither a source URL nor content, or if an I/O error
 *           occurs
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Create a status listener
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // This is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    NekoHtmlDocumentHandler handler = null;
    // Resources opened on the source URL for textual documents. They are
    // declared here so the finally block can release them whatever happens.
    InputStream uStream = null;
    Reader docReader = null;
    try {
        org.cyberneko.html.HTMLConfiguration parser = new HTMLConfiguration();
        // convert element and attribute names to lower case
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        // make parser augment infoset with location information
        parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);
        // Create a new document handler
        handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
        // Register a status listener with it
        handler.addStatusListener(statusListener);
        // set repositioning object
        handler.setRepositioningInfo(repInfo);
        // set the object with ampersand coding positions
        handler.setAmpCodingInfo(ampCodingInfo);
        // construct the list of offsets for each line of the document
        int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
        handler.setLineOffsets(lineOffsets);
        // set the handlers
        parser.setDocumentHandler(handler);
        parser.setErrorHandler(handler);
        // Parse the document with the appropriate encoding
        XMLInputSource is;
        if (docHasContentButNoValidURL) {
            // no URL, so parse from string
            is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
        } else if (doc instanceof TextualDocument) {
            // textual document - load with user specified encoding
            String docEncoding = ((TextualDocument) doc).getEncoding();
            // XML, so no BOM stripping.
            URLConnection conn = doc.getSourceUrl().openConnection();
            uStream = conn.getInputStream();
            if ("gzip".equals(conn.getContentEncoding())) {
                uStream = new GZIPInputStream(uStream);
            }
            docReader = new InputStreamReader(uStream, docEncoding);
            is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString(), docReader, docEncoding);
            // since we control the encoding, tell the parser to ignore any
            // meta http-equiv hints
            parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
        } else {
            // let the parser decide the encoding
            is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString());
        }
        /* The following line can forward an
         * ArrayIndexOutOfBoundsException from
         * org.cyberneko.html.HTMLConfiguration.parse and crash GATE.    */
        parser.parse(is);
        // Only DocumentImpl exposes setNextAnnotationId; guard the cast so
        // another Document implementation is not misreported as a parsing
        // error through a ClassCastException.
        if (doc instanceof DocumentImpl) {
            ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
        }
    } catch (IOException e) {
        /* Handle IOException specially. String concatenation keeps the
         * message null-safe when the document has no source URL.      */
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } catch (Exception e) {
        /* Handle XNIException and ArrayIndexOutOfBoundsException:
         * flag the parsing error and keep going.     */
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // the document asked for format errors to be propagated
            throw new DocumentFormatException(e);
        } else {
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        }
    } finally {
        // Release the URL resources; a failure to close must not mask the
        // outcome of the parse, so it is deliberately ignored.
        if (docReader != null) {
            try {
                docReader.close();
            } catch (IOException ignored) {
                // best-effort cleanup
            }
        }
        if (uStream != null) {
            try {
                uStream.close();
            } catch (IOException ignored) {
                // best-effort cleanup
            }
        }
        if (handler != null) {
            handler.removeStatusListener(statusListener);
        }
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) XMLInputSource(org.apache.xerces.xni.parser.XMLInputSource) GZIPInputStream(java.util.zip.GZIPInputStream) InputStream(java.io.InputStream) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) IOException(java.io.IOException) URLConnection(java.net.URLConnection) ResourceInstantiationException(gate.creole.ResourceInstantiationException) IOException(java.io.IOException) DocumentFormatException(gate.util.DocumentFormatException) DocumentFormatException(gate.util.DocumentFormatException) NekoHtmlDocumentHandler(gate.html.NekoHtmlDocumentHandler) GZIPInputStream(java.util.zip.GZIPInputStream) TextualDocument(gate.TextualDocument) StringReader(java.io.StringReader) StatusListener(gate.event.StatusListener)

Example 5 with DocumentFormatException

use of gate.util.DocumentFormatException in project gate-core by GateNLP.

the class SgmlDocumentFormat method unpackMarkup.

/**
 * Unpack the markup in the document. This converts markup from the
 * native format (e.g. SGML) into annotations in GATE format.
 * Uses the markupElementsMap to determine which elements to convert, and
 * what annotation type names to use.
 * The doc's content is first converted to XML by {@link Sgml2Xml}; the
 * URI returned by the conversion is then parsed as an XML document with
 * a non-validating, namespace-aware SAX parser.
 *
 * @param doc The gate document you want to parse.
 * @throws DocumentFormatException if <code>doc</code> is null, if it has
 *           neither a source URL nor content, or if the SAX parser
 *           cannot be configured, fails to parse, or hits an I/O error
 */
@Override
public void unpackMarkup(Document doc) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Create a status listener
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            fireStatusChanged(text);
        }
    };
    XmlDocumentHandler xmlDocHandler = null;
    try {
        Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
        fireStatusChanged("Performing SGML to XML...");
        // convert the SGML document
        String xmlUri = sgml2Xml.convert();
        fireStatusChanged("DONE !");
        // Get a parser factory.
        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        // Set up the factory to create the appropriate type of parser:
        // a non-validating one
        saxParserFactory.setValidating(false);
        // a namespace-aware one
        saxParserFactory.setNamespaceAware(true);
        // Create a SAX parser
        SAXParser parser = saxParserFactory.newSAXParser();
        // create a new Xml document handler
        xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
        // register a status listener with it
        xmlDocHandler.addStatusListener(statusListener);
        parser.parse(xmlUri, xmlDocHandler);
        // Only DocumentImpl exposes setNextAnnotationId; guard the cast so
        // other Document implementations do not fail with a
        // ClassCastException after a successful parse.
        if (doc instanceof DocumentImpl) {
            ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
        }
    } catch (ParserConfigurationException e) {
        throw new DocumentFormatException("XML parser configuration exception ", e);
    } catch (SAXException e) {
        throw new DocumentFormatException(e);
    } catch (IOException e) {
        // Keep the cause, and build the message with concatenation so a
        // document without a source URL does not trigger a
        // NullPointerException here.
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } finally {
        if (xmlDocHandler != null) {
            xmlDocHandler.removeStatusListener(statusListener);
        }
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) Sgml2Xml(gate.sgml.Sgml2Xml) XmlDocumentHandler(gate.xml.XmlDocumentHandler) StatusListener(gate.event.StatusListener) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException)

Aggregations

DocumentFormatException (gate.util.DocumentFormatException)11 IOException (java.io.IOException)7 StatusListener (gate.event.StatusListener)6 TextualDocument (gate.TextualDocument)3 InvalidOffsetException (gate.util.InvalidOffsetException)3 XmlDocumentHandler (gate.xml.XmlDocumentHandler)3 InputStream (java.io.InputStream)3 InputStreamReader (java.io.InputStreamReader)3 Reader (java.io.Reader)3 StringReader (java.io.StringReader)3 SAXException (org.xml.sax.SAXException)3 Annotation (gate.Annotation)2 AnnotationSet (gate.AnnotationSet)2 ResourceInstantiationException (gate.creole.ResourceInstantiationException)2 XMLStreamReader (javax.xml.stream.XMLStreamReader)2 gate (gate)1 DocumentFormat (gate.DocumentFormat)1 FeatureMap (gate.FeatureMap)1 EmailDocumentHandler (gate.email.EmailDocumentHandler)1 NekoHtmlDocumentHandler (gate.html.NekoHtmlDocumentHandler)1