Search in sources :

Example 1 with TextualDocument

use of gate.TextualDocument in project gate-core by GateNLP.

the class XmlDocumentFormat method unpackGateFormatMarkup.

 * Unpacks markup in the GATE-specific standoff XML markup format.
 * @param doc the document to process
 * @param statusListener optional status listener to receive status
 *          messages
 * @throws DocumentFormatException if a fatal error occurs during
 *           parsing
private void unpackGateFormatMarkup(Document doc, StatusListener statusListener) throws DocumentFormatException {
    // Builds a StAX XMLStreamReader over the document and hands it to
    // DocumentStaxUtils.readGateXmlDocument, which does the actual parsing.
    // NOTE(review): this excerpt appears to have lost several closing braces
    // and the bodies of the inputStream / inputReader null checks further
    // down - confirm against the full source before relying on it.
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    try {
        Reader inputReader = null;
        InputStream inputStream = null;
        XMLStreamReader xsr = null;
        if (docHasContentButNoValidURL) {
            // no usable URL: parse the document's in-memory content
            inputReader = new StringReader(doc.getContent().toString());
            xsr = getInputFactory().createXMLStreamReader(inputReader);
        } else if (doc instanceof TextualDocument) {
            // decode the URL stream with the encoding configured on the document
            String encoding = ((TextualDocument) doc).getEncoding();
            // Don't strip BOM on XML.
            inputReader = new InputStreamReader(doc.getSourceUrl().openStream(), encoding);
            // create stream reader with the URL as system ID, to support
            // relative URLs to e.g. DTD or external entities
            xsr = getInputFactory().createXMLStreamReader(doc.getSourceUrl().toExternalForm(), inputReader);
        } else {
            // not a TextualDocument, so let parser determine encoding
            inputStream = doc.getSourceUrl().openStream();
            xsr = getInputFactory().createXMLStreamReader(doc.getSourceUrl().toExternalForm(), inputStream);
        // find the opening GateDocument tag
        // parse the document
        try {
            DocumentStaxUtils.readGateXmlDocument(xsr, doc, statusListener);
        } finally {
            // NOTE(review): the bodies of these two checks are not visible
            // here; presumably they close whichever source was opened - verify.
            if (inputStream != null) {
            if (inputReader != null) {
    } catch (XMLStreamException e) {
        // record the failure on the document itself
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        // the document feature named by THROWEX_FORMAT_PROPERTY_NAME decides
        // whether the parse error is rethrown or only reported via Out
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // error
            throw new DocumentFormatException(e);
        } else {
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
    // if
    } catch (IOException ioe) {
        // I/O failures are always rethrown, wrapped together with the source URL
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl().toString(), ioe);
Also used : DocumentFormatException(gate.util.DocumentFormatException) XMLStreamReader( InputStreamReader( XMLStreamException( InputStream( TextualDocument(gate.TextualDocument) StringReader( Reader( InputStreamReader( StringReader( XMLStreamReader( IOException(

Example 2 with TextualDocument

use of gate.TextualDocument in project gate-core by GateNLP.

the class DocumentStaxUtils method writeDocument.

public static void writeDocument(Document doc, OutputStream outputStream, String namespaceURI) throws XMLStreamException, IOException {
    // Creates an XMLStreamWriter over outputStream and delegates to the
    // writeDocument(doc, xsw, namespaceURI) overload.
    // NOTE(review): closing braces and the body of the final xsw null check
    // are missing from this excerpt - confirm against the full source.
    // create the outputFactory (declared outside this excerpt) on first use
    if (outputFactory == null) {
        outputFactory = XMLOutputFactory.newInstance();
    XMLStreamWriter xsw = null;
    try {
        if (doc instanceof TextualDocument) {
            // use the document's own encoding for both the writer and the
            // XML declaration
            xsw = outputFactory.createXMLStreamWriter(outputStream, ((TextualDocument) doc).getEncoding());
            xsw.writeStartDocument(((TextualDocument) doc).getEncoding(), "1.0");
        } else {
            // no encoding available on the document: none is passed to the factory
            xsw = outputFactory.createXMLStreamWriter(outputStream);
        writeDocument(doc, xsw, namespaceURI);
    } finally {
        // NOTE(review): body not visible; presumably closes the writer - verify.
        if (xsw != null) {
Also used : XMLStreamWriter( TextualDocument(gate.TextualDocument)

Example 3 with TextualDocument

use of gate.TextualDocument in project gate-core by GateNLP.

the class DocumentStaxUtils method toXml.

 * Returns a string containing the specified document in GATE XML
 * format.
 * @param doc the document
public static String toXml(Document doc) {
    // Serialises doc into an in-memory StringWriter through an
    // XMLStreamWriter and returns the resulting text.
    // NOTE(review): closing braces and the body of the else branch are
    // missing from this excerpt - confirm against the full source.
    try {
        // create the outputFactory (declared outside this excerpt) on first use
        if (outputFactory == null) {
            outputFactory = XMLOutputFactory.newInstance();
        // initial buffer size: content size multiplied by
        // DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR
        StringWriter sw = new StringWriter(doc.getContent().size().intValue() * DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR);
        XMLStreamWriter xsw = outputFactory.createXMLStreamWriter(sw);
        // start the document
        if (doc instanceof TextualDocument) {
            // declare the document's own encoding in the XML declaration
            xsw.writeStartDocument(((TextualDocument) doc).getEncoding(), "1.0");
        } else {
        // an empty string is passed as the namespace URI
        writeDocument(doc, xsw, "");
        return sw.toString();
    } catch (XMLStreamException xse) {
        // rethrown as GateRuntimeException with the original exception as cause
        throw new GateRuntimeException("Error converting document to XML", xse);
Also used : StringWriter( XMLStreamException( XMLStreamWriter( TextualDocument(gate.TextualDocument) GateRuntimeException(gate.util.GateRuntimeException)

Example 4 with TextualDocument

use of gate.TextualDocument in project gate-core by GateNLP.

the class NekoHtmlDocumentFormat method unpackMarkup.

 * Unpack the markup in the document. This converts markup from the
 * native format into annotations in GATE format. If the document was
 * created from a String, then it is recommended to set the doc's
 * sourceUrl to <b>null</b>. So, if the document has a valid URL,
 * then the parser will try to parse the XML document pointed to by the
 * URL. If the URL is not valid, or is null, then the doc's content
 * will be parsed. If the doc's content is not a valid XML then the
 * parser might crash.
 * @param doc The gate document you want to parse. If
 *          <code>doc.getSourceUrl()</code> returns <b>null</b>
 *          then the content of doc will be parsed. Using a URL is
 *          recommended because the parser will report errors correctly
 *          if the document is not well formed.
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    // Configures a NekoHTML HTMLConfiguration and a NekoHtmlDocumentHandler
    // for doc, and chooses the XMLInputSource to parse from.
    // NOTE(review): this excerpt has lost closing braces, the handler wiring,
    // the parse call itself and the body of the finally block; repInfo and
    // ampCodingInfo are not used in any visible line. Confirm against the
    // full source before relying on it.
    // reject a null document, or one with neither a source URL nor content
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    // End if
    // Create a status listener
    StatusListener statusListener = new StatusListener() {

        public void statusChanged(String text) {
            // This is implemented in and inherited here
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    NekoHtmlDocumentHandler handler = null;
    try {
        org.cyberneko.html.HTMLConfiguration parser = new HTMLConfiguration();
        // convert element and attribute names to lower case
        // NOTE(review): both property names are empty strings here; they look
        // like identifiers stripped from this excerpt - confirm the real names.
        parser.setProperty("", "lower");
        parser.setProperty("", "lower");
        // make parser augment infoset with location information
        parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);
        // Create a new Xml document handler
        handler = new NekoHtmlDocumentHandler(doc, null, ignorableTags);
        // Register a status listener with it
        // set repositioning object
        // set the object with ampersand coding positions
        // construct the list of offsets for each line of the document
        int[] lineOffsets = buildLineOffsets(doc.getContent().toString());
        // set the handlers
        // Parse the XML Document with the appropriate encoding
        XMLInputSource is;
        if (docHasContentButNoValidURL) {
            // no URL, so parse from string
            is = new XMLInputSource(null, null, null, new StringReader(doc.getContent().toString()), null);
        } else if (doc instanceof TextualDocument) {
            // textual document - load with user specified encoding
            String docEncoding = ((TextualDocument) doc).getEncoding();
            // XML, so no BOM stripping.
            URLConnection conn = doc.getSourceUrl().openConnection();
            InputStream uStream = conn.getInputStream();
            // transparently decompress when the connection reports gzip
            // content encoding
            if ("gzip".equals(conn.getContentEncoding())) {
                uStream = new GZIPInputStream(uStream);
            Reader docReader = new InputStreamReader(uStream, docEncoding);
            // the source URL string is passed twice, as the second and third
            // constructor arguments
            is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString(), docReader, docEncoding);
            // since we control the encoding, tell the parser to ignore any
            // meta http-equiv hints
            // NOTE(review): the feature name is an empty string here; it looks
            // stripped from this excerpt - confirm the real name.
            parser.setFeature("", true);
        } else {
            // let the parser decide the encoding
            is = new XMLInputSource(null, doc.getSourceUrl().toString(), doc.getSourceUrl().toString());
        /* The following line can forward an
       * ArrayIndexOutOfBoundsException from
       * org.cyberneko.html.HTMLConfiguration.parse and crash GATE.    */
        // Angel - end
        // continue annotation ID allocation from the value reported by the handler
        ((DocumentImpl) doc).setNextAnnotationId(handler.getCustomObjectsId());
    }/* Handle IOException specially.      */
     catch (IOException e) {
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl().toString(), e);
    }/* Handle XNIException and ArrayIndexOutOfBoundsException:
     * flag the parsing error and keep going.     */
     catch (Exception e) {
        // record the failure on the document itself
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        // the document feature named by THROWEX_FORMAT_PROPERTY_NAME decides
        // whether the error is rethrown or only reported via Out
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // error
            throw new DocumentFormatException(e);
        } else {
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
    // if
    } finally {
        // NOTE(review): the statement guarded by this check is not visible here.
        if (handler != null)
// End if else try
Also used : InputStreamReader( XMLInputSource(org.apache.xerces.xni.parser.XMLInputSource) GZIPInputStream( InputStream( HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) Reader( InputStreamReader( StringReader( HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) IOException( URLConnection( ResourceInstantiationException(gate.creole.ResourceInstantiationException) IOException( DocumentFormatException(gate.util.DocumentFormatException) DocumentFormatException(gate.util.DocumentFormatException) NekoHtmlDocumentHandler(gate.html.NekoHtmlDocumentHandler) GZIPInputStream( TextualDocument(gate.TextualDocument) StringReader( StatusListener(gate.event.StatusListener)

Example 5 with TextualDocument

use of gate.TextualDocument in project gate-core by GateNLP.

the class XmlDocumentFormat method unpackGeneralXmlMarkup.

 * Unpack markup from any XML format. The XML elements are translated
 * to annotations on the Original markups annotation set.
 * @param doc the document to process
 * @throws DocumentFormatException
private void unpackGeneralXmlMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo, StatusListener statusListener) throws DocumentFormatException {
    // Sets up a JAXP SAX parser and an XmlDocumentHandler for doc, and
    // chooses the InputSource to parse from.
    // NOTE(review): this excerpt has lost closing braces, the factory
    // configuration calls, the handler wiring, the parse call itself and the
    // bodies of both finally blocks; repInfo, ampCodingInfo and
    // statusListener are not used in any visible line. Confirm against the
    // full source before relying on it.
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    XmlDocumentHandler xmlDocHandler = null;
    try {
        // use Xerces XML parser with JAXP
        // System.setProperty("javax.xml.parsers.SAXParserFactory",
        // "org.apache.xerces.jaxp.SAXParserFactoryImpl");
        // Get a parser factory.
        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        // Set up the factory to create the appropriate type of parser
        // non validating one
        // non namespace aware one
        // create it
        SAXParser xmlParser = saxParserFactory.newSAXParser();
        // Create a new Xml document handler
        xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
        // Register a status listener with it
        // set repositioning object
        // set the object with ampersand coding positions
        org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
        // Set up the factory to create the appropriate type of parser
        // non validating one
        // set to false
        // NOTE(review): the three feature names below are empty strings; they
        // look like identifiers stripped from this excerpt - confirm the
        // real names.
        newxmlParser.setFeature("", false);
        // namespace aware one
        // set to true
        newxmlParser.setFeature("", true);
        newxmlParser.setFeature("", true);
        // Parse the XML Document with the appropriate encoding
        Reader docReader = null;
        try {
            InputSource is;
            if (docHasContentButNoValidURL) {
                // no URL, so parse from string
                is = new InputSource(new StringReader(doc.getContent().toString()));
            } else if (doc instanceof TextualDocument) {
                // textual document - load with user specified encoding
                String docEncoding = ((TextualDocument) doc).getEncoding();
                // don't strip BOM on XML.
                docReader = new InputStreamReader(doc.getSourceUrl().openStream(), docEncoding);
                is = new InputSource(docReader);
                // must set system ID to allow relative URLs (e.g. to a DTD) to
                // work
                // NOTE(review): no call that sets the system ID is visible
                // after this comment - verify it exists in the full source.
            } else {
                // let the parser decide the encoding
                is = new InputSource(doc.getSourceUrl().toString());
        } finally {
            // make sure the open streams are closed
            if (docReader != null)
        // Angel - end
        // continue annotation ID allocation from the value reported by the handler
        ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
    } catch (ParserConfigurationException e) {
        throw new DocumentFormatException("XML parser configuration exception ", e);
    } catch (SAXException e) {
        // record the failure on the document itself
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        // the document feature named by THROWEX_FORMAT_PROPERTY_NAME decides
        // whether the parse error is rethrown or only reported via Out
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // error
            throw new DocumentFormatException(e);
        } else {
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
    // if
    } catch (IOException e) {
        // I/O failures are always rethrown, wrapped together with the source URL
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } finally {
        // NOTE(review): the statement guarded by this check is not visible here.
        if (xmlDocHandler != null)
// End if else try
Also used : InputSource(org.xml.sax.InputSource) InputStreamReader( XmlDocumentHandler(gate.xml.XmlDocumentHandler) Reader( InputStreamReader( StringReader( XMLStreamReader( IOException( SAXException(org.xml.sax.SAXException) DocumentFormatException(gate.util.DocumentFormatException) TextualDocument(gate.TextualDocument) StringReader( SAXParser(javax.xml.parsers.SAXParser) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXParserFactory(javax.xml.parsers.SAXParserFactory)


TextualDocument (gate.TextualDocument)5 DocumentFormatException (gate.util.DocumentFormatException)3 IOException ( InputStreamReader ( Reader ( StringReader ( InputStream ( XMLStreamException ( XMLStreamReader ( XMLStreamWriter ( ResourceInstantiationException (gate.creole.ResourceInstantiationException)1 StatusListener (gate.event.StatusListener)1 NekoHtmlDocumentHandler (gate.html.NekoHtmlDocumentHandler)1 GateRuntimeException (gate.util.GateRuntimeException)1 XmlDocumentHandler (gate.xml.XmlDocumentHandler)1 StringWriter ( URLConnection ( GZIPInputStream ( ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)1 SAXParser (javax.xml.parsers.SAXParser)1