Search in sources :

Example 1 with TextualDocument

use of gate.TextualDocument in project gate-core by GateNLP.

the class XmlDocumentFormat method unpackGateFormatMarkup.

/**
 * Unpacks markup in the GATE-specific standoff XML markup format.
 *
 * <p>The XML is read from the document content when
 * {@code hasContentButNoValidUrl(doc)} is true, otherwise from the
 * document's source URL. For a {@link TextualDocument} the URL stream is
 * decoded with the document's own encoding; for any other document the
 * parser is left to determine the encoding itself.
 *
 * <p>On an {@link XMLStreamException} the document feature
 * {@code "parsingError"} is set to {@code Boolean.TRUE}. The exception is
 * rethrown as a {@link DocumentFormatException} only when the feature
 * {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is {@code TRUE};
 * otherwise a warning and the stack trace are printed and the method
 * returns normally.
 *
 * @param doc the document to process
 * @param statusListener optional status listener to receive status
 *          messages
 * @throws DocumentFormatException if an I/O error occurs, or if an XML
 *           stream error occurs and the document asks for parse errors
 *           to be thrown
 */
private void unpackGateFormatMarkup(Document doc, StatusListener statusListener) throws DocumentFormatException {
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    try {
        Reader inputReader = null;
        InputStream inputStream = null;
        XMLStreamReader xsr = null;
        // Everything that can open a resource lives inside this try so the
        // finally block below releases it on every failure path, including
        // failures while creating the stream reader or locating the root tag.
        try {
            if (docHasContentButNoValidURL) {
                inputReader = new StringReader(doc.getContent().toString());
                xsr = getInputFactory().createXMLStreamReader(inputReader);
            } else if (doc instanceof TextualDocument) {
                String encoding = ((TextualDocument) doc).getEncoding();
                // Keep a handle on the raw stream: if the encoding is not
                // supported the InputStreamReader constructor throws and the
                // stream would otherwise be leaked.
                inputStream = doc.getSourceUrl().openStream();
                // Don't strip BOM on XML.
                inputReader = new InputStreamReader(inputStream, encoding);
                // create stream reader with the URL as system ID, to support
                // relative URLs to e.g. DTD or external entities
                xsr = getInputFactory().createXMLStreamReader(doc.getSourceUrl().toExternalForm(), inputReader);
            } else {
                // not a TextualDocument, so let parser determine encoding
                inputStream = doc.getSourceUrl().openStream();
                xsr = getInputFactory().createXMLStreamReader(doc.getSourceUrl().toExternalForm(), inputStream);
            }
            // find the opening GateDocument tag
            xsr.nextTag();
            // parse the document
            DocumentStaxUtils.readGateXmlDocument(xsr, doc, statusListener);
        } finally {
            // Nested finally blocks guarantee that a failure while closing
            // one resource does not prevent the others from being closed.
            try {
                if (xsr != null) {
                    xsr.close();
                }
            } finally {
                try {
                    if (inputStream != null) {
                        inputStream.close();
                    }
                } finally {
                    if (inputReader != null) {
                        inputReader.close();
                    }
                }
            }
        }
    } catch (XMLStreamException e) {
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // the document asked for parse errors to be propagated
            throw new DocumentFormatException(e);
        } else {
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        }
    } catch (IOException ioe) {
        // string concatenation (rather than toString()) tolerates a null URL
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), ioe);
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) XMLStreamReader(javax.xml.stream.XMLStreamReader) InputStreamReader(java.io.InputStreamReader) XMLStreamException(javax.xml.stream.XMLStreamException) InputStream(java.io.InputStream) TextualDocument(gate.TextualDocument) StringReader(java.io.StringReader) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) XMLStreamReader(javax.xml.stream.XMLStreamReader) IOException(java.io.IOException)

Example 2 with TextualDocument

use of gate.TextualDocument in project gate-core by GateNLP.

the class DocumentStaxUtils method writeDocument.

/**
 * Writes the given document as XML to the supplied output stream.
 *
 * <p>The shared {@code outputFactory} is created on first use. When the
 * document is a {@link TextualDocument} its encoding is used both for the
 * stream writer and for the XML declaration; otherwise the factory
 * default is used. The body is written by the
 * {@code writeDocument(doc, xsw, namespaceURI)} overload.
 *
 * @param doc the document to write
 * @param outputStream the stream that receives the XML
 * @param namespaceURI passed through unchanged to the
 *          {@code XMLStreamWriter} overload
 * @throws XMLStreamException if the stream writer reports an error
 * @throws IOException declared by the signature; propagated from the
 *           calls made here
 */
public static void writeDocument(Document doc, OutputStream outputStream, String namespaceURI) throws XMLStreamException, IOException {
    if (outputFactory == null) {
        outputFactory = XMLOutputFactory.newInstance();
    }
    XMLStreamWriter writer = null;
    try {
        if (!(doc instanceof TextualDocument)) {
            // no declared encoding: let the factory pick its default
            writer = outputFactory.createXMLStreamWriter(outputStream);
            writer.writeStartDocument("1.0");
        } else {
            // encode the bytes and the XML declaration consistently
            String encoding = ((TextualDocument) doc).getEncoding();
            writer = outputFactory.createXMLStreamWriter(outputStream, encoding);
            writer.writeStartDocument(encoding, "1.0");
        }
        newLine(writer);
        writeDocument(doc, writer, namespaceURI);
    } finally {
        // release the stream writer whether or not writing succeeded
        if (writer != null) {
            writer.close();
        }
    }
}
Also used : XMLStreamWriter(javax.xml.stream.XMLStreamWriter) TextualDocument(gate.TextualDocument)

Example 3 with TextualDocument

use of gate.TextualDocument in project gate-core by GateNLP.

the class DocumentStaxUtils method toXml.

/**
 * Returns a string containing the specified document in GATE XML
 * format.
 *
 * <p>The output buffer is pre-sized from the document content size
 * multiplied by {@code DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR}.
 * When the document is a {@link TextualDocument} its encoding is named
 * in the XML declaration.
 *
 * @param doc the document
 * @return the XML serialisation of {@code doc}
 * @throws GateRuntimeException if the XML stream writer reports an error
 */
public static String toXml(Document doc) {
    try {
        if (outputFactory == null) {
            outputFactory = XMLOutputFactory.newInstance();
        }
        StringWriter sw = new StringWriter(doc.getContent().size().intValue() * DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR);
        XMLStreamWriter xsw = outputFactory.createXMLStreamWriter(sw);
        try {
            // start the document
            if (doc instanceof TextualDocument) {
                xsw.writeStartDocument(((TextualDocument) doc).getEncoding(), "1.0");
            } else {
                xsw.writeStartDocument("1.0");
            }
            newLine(xsw);
            writeDocument(doc, xsw, "");
        } finally {
            // always release the stream writer, even when writing fails
            xsw.close();
        }
        // the writer has been closed, so everything is in the buffer
        return sw.toString();
    } catch (XMLStreamException xse) {
        throw new GateRuntimeException("Error converting document to XML", xse);
    }
}
Also used : StringWriter(java.io.StringWriter) XMLStreamException(javax.xml.stream.XMLStreamException) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) TextualDocument(gate.TextualDocument) GateRuntimeException(gate.util.GateRuntimeException)

Example 4 with TextualDocument

use of gate.TextualDocument in project gate-core by GateNLP.

the class NekoHtmlDocumentFormat method unpackMarkup.

/**
 * Unpack the markup in the document. This converts markup from the
 * native format into annotations in GATE format. If the document was
 * created from a String, it is recommended to set the doc's sourceUrl
 * to <b>null</b>. If the document has a valid URL, the parser will try
 * to parse the document pointed to by the URL. If the URL is not valid,
 * or is null, the doc's content will be parsed instead.
 *
 * <p>Any exception other than an {@link IOException} raised while
 * parsing sets the document feature {@code "parsingError"} to
 * {@code Boolean.TRUE}. It is rethrown only when the feature
 * {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is {@code TRUE};
 * otherwise a warning and the stack trace are printed.
 *
 * @param doc The gate document you want to parse. If
 *          <code>doc.getSourceUrl()</code> returns <b>null</b>
 *          then the content of doc will be parsed. Using a URL is
 *          recommended because the parser will report errors correctly
 *          if the document is not well formed.
 * @param repInfo repositioning information handed to the document
 *          handler
 * @param ampCodingInfo ampersand coding positions handed to the
 *          document handler
 * @throws DocumentFormatException if {@code doc} is null, has neither a
 *           source URL nor content, an I/O error occurs, or a parse
 *           error occurs and the document asks for it to be thrown
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Create a status listener
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // This is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    NekoHtmlDocumentHandler handler = null;
    try {
        org.cyberneko.html.HTMLConfiguration parser = new HTMLConfiguration();
        // convert element and attribute names to lower case
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        // make parser augment infoset with location information
        parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);
        // Create a new document handler
        handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
        // Register a status listener with it
        handler.addStatusListener(statusListener);
        // set repositioning object
        handler.setRepositioningInfo(repInfo);
        // set the object with ampersand coding positions
        handler.setAmpCodingInfo(ampCodingInfo);
        // construct the list of offsets for each line of the document
        int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
        handler.setLineOffsets(lineOffsets);
        // set the handlers
        parser.setDocumentHandler(handler);
        parser.setErrorHandler(handler);
        // Parse the document with the appropriate encoding
        XMLInputSource is;
        // Resources opened by this method; released in the finally below.
        Reader docReader = null;
        InputStream uStream = null;
        try {
            if (docHasContentButNoValidURL) {
                // no URL, so parse from string
                is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
            } else if (doc instanceof TextualDocument) {
                // textual document - load with user specified encoding
                String docEncoding = ((TextualDocument) doc).getEncoding();
                // no BOM stripping.
                URLConnection conn = doc.getSourceUrl().openConnection();
                uStream = conn.getInputStream();
                if ("gzip".equals(conn.getContentEncoding())) {
                    uStream = new GZIPInputStream(uStream);
                }
                docReader = new InputStreamReader(uStream, docEncoding);
                is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString(), docReader, docEncoding);
                // since we control the encoding, tell the parser to ignore any
                // meta http-equiv hints
                parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
            } else {
                // let the parser decide the encoding
                is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString());
            }
            /* The following line can forward an
             * ArrayIndexOutOfBoundsException from
             * org.cyberneko.html.HTMLConfiguration.parse; it is handled by
             * the generic catch block below.    */
            parser.parse(is);
        } finally {
            // make sure the open streams are closed; closing the reader also
            // closes the stream it wraps, so the raw stream is only closed
            // directly when the reader was never created
            if (docReader != null) {
                docReader.close();
            } else if (uStream != null) {
                uStream.close();
            }
        }
        ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
    } catch (IOException e) {
        /* Handle IOException specially. String concatenation (rather than
         * toString()) tolerates a null source URL. */
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } catch (Exception e) {
        /* Handle XNIException and ArrayIndexOutOfBoundsException:
         * flag the parsing error and keep going.     */
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // the document asked for parse errors to be propagated
            throw new DocumentFormatException(e);
        } else {
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        }
    } finally {
        if (handler != null) {
            handler.removeStatusListener(statusListener);
        }
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) XMLInputSource(org.apache.xerces.xni.parser.XMLInputSource) GZIPInputStream(java.util.zip.GZIPInputStream) InputStream(java.io.InputStream) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) IOException(java.io.IOException) URLConnection(java.net.URLConnection) ResourceInstantiationException(gate.creole.ResourceInstantiationException) IOException(java.io.IOException) DocumentFormatException(gate.util.DocumentFormatException) DocumentFormatException(gate.util.DocumentFormatException) NekoHtmlDocumentHandler(gate.html.NekoHtmlDocumentHandler) GZIPInputStream(java.util.zip.GZIPInputStream) TextualDocument(gate.TextualDocument) StringReader(java.io.StringReader) StatusListener(gate.event.StatusListener)

Example 5 with TextualDocument

use of gate.TextualDocument in project gate-core by GateNLP.

the class XmlDocumentFormat method unpackGeneralXmlMarkup.

/**
 * Unpacks markup from an arbitrary XML format by running a SAX parse
 * with an {@code XmlDocumentHandler} installed as content, error, DTD
 * and entity handler.
 *
 * <p>The input is the document content when
 * {@code hasContentButNoValidUrl(doc)} is true. Otherwise it is the
 * source URL, decoded with the document's own encoding for a
 * {@link TextualDocument} and left to the parser for anything else.
 *
 * <p>A {@link SAXException} sets the document feature
 * {@code "parsingError"} to {@code Boolean.TRUE}; it is rethrown only
 * when the feature {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is
 * {@code TRUE}, otherwise a warning and the stack trace are printed.
 *
 * @param doc the document to process
 * @param repInfo repositioning information handed to the handler
 * @param ampCodingInfo ampersand coding positions handed to the handler
 * @param statusListener listener registered on the handler for the
 *          duration of the parse
 * @throws DocumentFormatException on parser configuration or I/O
 *           failure, or on a parse error the document asks to be thrown
 */
private void unpackGeneralXmlMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo, StatusListener statusListener) throws DocumentFormatException {
    boolean parseFromContent = hasContentButNoValidUrl(doc);
    XmlDocumentHandler handler = null;
    try {
        // JAXP factory: non-validating, namespace-aware
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setValidating(false);
        factory.setNamespaceAware(true);
        SAXParser saxParser = factory.newSAXParser();

        // handler that receives the SAX events for this document
        handler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
        handler.addStatusListener(statusListener);
        handler.setRepositioningInfo(repInfo);
        handler.setAmpCodingInfo(ampCodingInfo);

        // configure the underlying reader: no validation, namespaces and
        // namespace prefixes both reported
        org.xml.sax.XMLReader reader = saxParser.getXMLReader();
        reader.setFeature("http://xml.org/sax/features/validation", false);
        reader.setFeature("http://xml.org/sax/features/namespaces", true);
        reader.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
        reader.setContentHandler(handler);
        reader.setErrorHandler(handler);
        reader.setDTDHandler(handler);
        reader.setEntityResolver(handler);

        // only set when this method opens the URL stream itself
        Reader encodedReader = null;
        try {
            InputSource source;
            if (parseFromContent) {
                // no usable URL: feed the parser the in-memory content
                source = new InputSource(new StringReader(doc.getContent().toString()));
            } else if (!(doc instanceof TextualDocument)) {
                // hand the parser the URL and let it work out the encoding
                source = new InputSource(doc.getSourceUrl().toString());
            } else {
                // decode with the encoding the document specifies; the BOM
                // is deliberately left in place for XML
                String encoding = ((TextualDocument) doc).getEncoding();
                encodedReader = new InputStreamReader(doc.getSourceUrl().openStream(), encoding);
                source = new InputSource(encodedReader);
                // the system ID lets relative references (e.g. to a DTD)
                // resolve against the document URL
                source.setSystemId(doc.getSourceUrl().toString());
            }
            reader.parse(source);
        } finally {
            // release the stream opened above, if any
            if (encodedReader != null) {
                encodedReader.close();
            }
        }
        ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
    } catch (ParserConfigurationException e) {
        throw new DocumentFormatException("XML parser configuration exception ", e);
    } catch (SAXException e) {
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        Boolean throwOnError = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (!Boolean.TRUE.equals(throwOnError)) {
            // default: report the problem and leave the document unparsed
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        } else {
            throw new DocumentFormatException(e);
        }
    } catch (IOException e) {
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } finally {
        // the listener is only needed while the parse is running
        if (handler != null) {
            handler.removeStatusListener(statusListener);
        }
    }
}
Also used : InputSource(org.xml.sax.InputSource) InputStreamReader(java.io.InputStreamReader) XmlDocumentHandler(gate.xml.XmlDocumentHandler) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) XMLStreamReader(javax.xml.stream.XMLStreamReader) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException) DocumentFormatException(gate.util.DocumentFormatException) TextualDocument(gate.TextualDocument) StringReader(java.io.StringReader) SAXParser(javax.xml.parsers.SAXParser) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXParserFactory(javax.xml.parsers.SAXParserFactory)

Aggregations

TextualDocument (gate.TextualDocument)5 DocumentFormatException (gate.util.DocumentFormatException)3 IOException (java.io.IOException)3 InputStreamReader (java.io.InputStreamReader)3 Reader (java.io.Reader)3 StringReader (java.io.StringReader)3 InputStream (java.io.InputStream)2 XMLStreamException (javax.xml.stream.XMLStreamException)2 XMLStreamReader (javax.xml.stream.XMLStreamReader)2 XMLStreamWriter (javax.xml.stream.XMLStreamWriter)2 ResourceInstantiationException (gate.creole.ResourceInstantiationException)1 StatusListener (gate.event.StatusListener)1 NekoHtmlDocumentHandler (gate.html.NekoHtmlDocumentHandler)1 GateRuntimeException (gate.util.GateRuntimeException)1 XmlDocumentHandler (gate.xml.XmlDocumentHandler)1 StringWriter (java.io.StringWriter)1 URLConnection (java.net.URLConnection)1 GZIPInputStream (java.util.zip.GZIPInputStream)1 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)1 SAXParser (javax.xml.parsers.SAXParser)1