Search in sources :

Example 1 with XmlDocumentHandler

use of gate.xml.XmlDocumentHandler in project gate-core by GateNLP.

The method unpackMarkup of the class TikaFormat.

/**
 * Parses the document's source URL with the Tika parser, feeding the SAX
 * events to an {@link XmlDocumentHandler}, then copies the collected
 * metadata onto the document via {@code setDocumentFeatures}.
 *
 * @param doc the GATE document to parse; must be non-null and have a source URL
 * @param repInfo repositioning info handed to the handler
 * @param ampCodingInfo ampersand-coding positions handed to the handler
 * @throws DocumentFormatException if {@code doc} or its source URL is null,
 *         or if opening the stream or parsing fails
 */
@Override
public void unpackMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    boolean nothingToParse = (doc == null) || (doc.getSourceUrl() == null);
    if (nothingToParse) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }

    // Relay the handler's progress messages to this format's own listeners.
    StatusListener relay = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // fireStatusChanged is implemented in DocumentFormat.java and inherited here
            fireStatusChanged(text);
        }
    };

    XmlDocumentHandler handler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    Metadata tips = extractParserTips(doc);
    handler.addStatusListener(relay);
    handler.setRepositioningInfo(repInfo);
    // hand over the object holding the ampersand coding positions
    handler.setAmpCodingInfo(ampCodingInfo);

    InputStream source = null;
    try {
        Parser parser = new TikaConfig().getParser();
        source = doc.getSourceUrl().openStream();
        parser.parse(source, handler, tips, new ParseContext());
        setDocumentFeatures(tips, doc);
    } catch (IOException | SAXException | TikaException e) {
        // every parse failure is reported the same way, with the cause attached
        throw new DocumentFormatException(e);
    } finally {
        // closeQuietly tolerates a null stream
        IOUtils.closeQuietly(source);
        handler.removeStatusListener(relay);
    }

    if (doc instanceof DocumentImpl) {
        DocumentImpl impl = (DocumentImpl) doc;
        impl.setNextAnnotationId(handler.getCustomObjectsId());
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) InputStream(java.io.InputStream) XmlDocumentHandler(gate.xml.XmlDocumentHandler) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) Parser(org.apache.tika.parser.Parser) SAXException(org.xml.sax.SAXException) DocumentFormatException(gate.util.DocumentFormatException) ParseContext(org.apache.tika.parser.ParseContext) StatusListener(gate.event.StatusListener)

Example 2 with XmlDocumentHandler

use of gate.xml.XmlDocumentHandler in project gate-core by GateNLP.

The method unpackMarkup of the class SgmlDocumentFormat.

/**
 * Unpack the markup in the document. This converts markup from the
 * native format (e.g. SGML) into annotations in GATE format.
 * Uses the markupElementsMap to determine which elements to convert, and
 * what annotation type names to use.
 * The doc's content is first converted to well-formed XML.
 * If this succeeds then the document is saved into a temp file and parsed
 * as an XML document.
 *
 * @param doc The GATE document you want to parse.
 * @throws DocumentFormatException if {@code doc} is null, if it has neither
 *         a source URL nor content, or if conversion or XML parsing fails
 */
@Override
public void unpackMarkup(Document doc) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    // Create a status listener that relays handler messages to our listeners
    StatusListener statusListener = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            fireStatusChanged(text);
        }
    };
    XmlDocumentHandler xmlDocHandler = null;
    try {
        Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
        fireStatusChanged("Performing SGML to XML...");
        // convert the SGML document
        String xmlUri = sgml2Xml.convert();
        fireStatusChanged("DONE !");
        // Get a parser factory.
        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        // Set up the factory to create the appropriate type of parser:
        // a non-validating one
        saxParserFactory.setValidating(false);
        // a namespace-aware one
        saxParserFactory.setNamespaceAware(true);
        // Create a SAX parser
        SAXParser parser = saxParserFactory.newSAXParser();
        // create a new Xml document handler
        xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
        // register a status listener with it
        xmlDocHandler.addStatusListener(statusListener);
        parser.parse(xmlUri, xmlDocHandler);
        // Only DocumentImpl exposes setNextAnnotationId; other Document
        // implementations are left untouched rather than failing the cast.
        if (doc instanceof DocumentImpl) {
            ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
        }
    } catch (ParserConfigurationException e) {
        throw new DocumentFormatException("XML parser configuration exception ", e);
    } catch (SAXException e) {
        throw new DocumentFormatException(e);
    } catch (IOException e) {
        // The source URL may legitimately be null (content-only document), so
        // rely on string concatenation instead of toString(); keep the cause.
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } finally {
        if (xmlDocHandler != null) {
            xmlDocHandler.removeStatusListener(statusListener);
        }
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) Sgml2Xml(gate.sgml.Sgml2Xml) XmlDocumentHandler(gate.xml.XmlDocumentHandler) StatusListener(gate.event.StatusListener) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException)

Example 3 with XmlDocumentHandler

use of gate.xml.XmlDocumentHandler in project gate-core by GateNLP.

The method unpackGeneralXmlMarkup of the class XmlDocumentFormat.

/**
 * Unpack markup from any XML format. The XML elements are translated
 * to annotations on the Original markups annotation set.
 * <p>
 * A SAX failure is recorded on the document as the {@code parsingError}
 * feature. It is rethrown only when the document feature named by
 * {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is {@code TRUE};
 * otherwise a warning and stack trace are printed and the method returns
 * normally.
 *
 * @param doc the document to process
 * @param repInfo repositioning info handed to the XML handler
 * @param ampCodingInfo ampersand-coding positions handed to the XML handler
 * @param statusListener listener registered on the handler for the duration
 *        of the parse and removed afterwards
 * @throws DocumentFormatException on parser configuration errors, I/O
 *         errors, or SAX errors when rethrowing is enabled (see above)
 */
private void unpackGeneralXmlMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo, StatusListener statusListener) throws DocumentFormatException {
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    XmlDocumentHandler xmlDocHandler = null;
    try {
        // Get a parser factory.
        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        // Set up the factory to create the appropriate type of parser:
        // a non-validating one
        saxParserFactory.setValidating(false);
        // a namespace-aware one
        saxParserFactory.setNamespaceAware(true);
        // create it
        SAXParser xmlParser = saxParserFactory.newSAXParser();
        // Create a new Xml document handler
        xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
        // Register a status listener with it
        xmlDocHandler.addStatusListener(statusListener);
        // set repositioning object
        xmlDocHandler.setRepositioningInfo(repInfo);
        // set the object with ampersand coding positions
        xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
        org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
        // Configure the reader itself the same way as the factory:
        // http://xml.org/sax/features/validation set to false
        newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
        // http://xml.org/sax/features/namespaces set to true
        newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
        newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
        // the same handler object serves all four SAX callback roles
        newxmlParser.setContentHandler(xmlDocHandler);
        newxmlParser.setErrorHandler(xmlDocHandler);
        newxmlParser.setDTDHandler(xmlDocHandler);
        newxmlParser.setEntityResolver(xmlDocHandler);
        // Parse the XML Document with the appropriate encoding
        Reader docReader = null;
        try {
            InputSource is;
            if (docHasContentButNoValidURL) {
                // no URL, so parse from string
                is = new InputSource(new StringReader(doc.getContent().toString()));
            } else if (doc instanceof TextualDocument) {
                // textual document - load with user specified encoding
                String docEncoding = ((TextualDocument) doc).getEncoding();
                // don't strip BOM on XML.
                docReader = new InputStreamReader(doc.getSourceUrl().openStream(), docEncoding);
                is = new InputSource(docReader);
                // must set system ID to allow relative URLs (e.g. to a DTD) to
                // work
                is.setSystemId(doc.getSourceUrl().toString());
            } else {
                // let the parser decide the encoding
                is = new InputSource(doc.getSourceUrl().toString());
            }
            newxmlParser.parse(is);
        } finally {
            // make sure the open streams are closed
            if (docReader != null) {
                docReader.close();
            }
        }
        // Only DocumentImpl exposes setNextAnnotationId; other Document
        // implementations are left untouched rather than failing the cast
        // after an otherwise successful parse.
        if (doc instanceof DocumentImpl) {
            ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
        }
    } catch (ParserConfigurationException e) {
        throw new DocumentFormatException("XML parser configuration exception ", e);
    } catch (SAXException e) {
        // flag the failure on the document regardless of whether we rethrow
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // the caller asked for format errors to be propagated
            throw new DocumentFormatException(e);
        } else {
            // best effort: report the problem and leave the document unparsed
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        }
    } catch (IOException e) {
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } finally {
        if (xmlDocHandler != null) {
            xmlDocHandler.removeStatusListener(statusListener);
        }
    }
}
Also used : InputSource(org.xml.sax.InputSource) InputStreamReader(java.io.InputStreamReader) XmlDocumentHandler(gate.xml.XmlDocumentHandler) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) XMLStreamReader(javax.xml.stream.XMLStreamReader) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException) DocumentFormatException(gate.util.DocumentFormatException) TextualDocument(gate.TextualDocument) StringReader(java.io.StringReader) SAXParser(javax.xml.parsers.SAXParser) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXParserFactory(javax.xml.parsers.SAXParserFactory)

Aggregations

DocumentFormatException (gate.util.DocumentFormatException)3 XmlDocumentHandler (gate.xml.XmlDocumentHandler)3 IOException (java.io.IOException)3 SAXException (org.xml.sax.SAXException)3 StatusListener (gate.event.StatusListener)2 TextualDocument (gate.TextualDocument)1 Sgml2Xml (gate.sgml.Sgml2Xml)1 InputStream (java.io.InputStream)1 InputStreamReader (java.io.InputStreamReader)1 Reader (java.io.Reader)1 StringReader (java.io.StringReader)1 ParserConfigurationException (javax.xml.parsers.ParserConfigurationException)1 SAXParser (javax.xml.parsers.SAXParser)1 SAXParserFactory (javax.xml.parsers.SAXParserFactory)1 XMLStreamReader (javax.xml.stream.XMLStreamReader)1 TikaConfig (org.apache.tika.config.TikaConfig)1 TikaException (org.apache.tika.exception.TikaException)1 Metadata (org.apache.tika.metadata.Metadata)1 ParseContext (org.apache.tika.parser.ParseContext)1 Parser (org.apache.tika.parser.Parser)1