Search in sources :

Example 11 with DocumentFormatException

Use of gate.util.DocumentFormatException in project gate-core by GateNLP.

The method unpackGeneralXmlMarkup of the class XmlDocumentFormat.

/**
 * Unpack markup from any XML format. The XML elements are translated
 * to annotations on the Original markups annotation set.
 *
 * @param doc the document to process
 * @param repInfo repositioning information handed to the XML document handler
 * @param ampCodingInfo ampersand coding positions handed to the XML document
 *          handler
 * @param statusListener listener registered on the XML document handler while
 *          parsing and removed again before this method returns
 * @throws DocumentFormatException if the parser cannot be configured, if an
 *           I/O error occurs while reading the document, or if parsing fails
 *           and the document's {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME}
 *           feature is {@code true}
 */
private void unpackGeneralXmlMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo, StatusListener statusListener) throws DocumentFormatException {
    boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
    XmlDocumentHandler xmlDocHandler = null;
    try {
        // Obtain a JAXP parser factory and configure it to create a
        // non-validating, namespace-aware parser
        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        saxParserFactory.setValidating(false);
        saxParserFactory.setNamespaceAware(true);
        SAXParser xmlParser = saxParserFactory.newSAXParser();
        // Create a new Xml document handler
        xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
        // Register a status listener with it (removed again in the outer finally)
        xmlDocHandler.addStatusListener(statusListener);
        // set repositioning object
        xmlDocHandler.setRepositioningInfo(repInfo);
        // set the object with ampersand coding positions
        xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
        org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
        // Repeat the configuration on the underlying reader:
        // no validation, namespaces on, and prefixed names still reported
        newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
        newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
        newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
        // The handler receives content, error and DTD events and also resolves
        // entities.
        // NOTE(review): external entity handling depends on xmlDocHandler's
        // resolver - confirm it is safe if documents may be untrusted.
        newxmlParser.setContentHandler(xmlDocHandler);
        newxmlParser.setErrorHandler(xmlDocHandler);
        newxmlParser.setDTDHandler(xmlDocHandler);
        newxmlParser.setEntityResolver(xmlDocHandler);
        // Parse the XML Document with the appropriate encoding
        java.io.InputStream docStream = null;
        Reader docReader = null;
        try {
            InputSource is;
            if (docHasContentButNoValidURL) {
                // no URL, so parse from string
                is = new InputSource(new StringReader(doc.getContent().toString()));
            } else if (doc instanceof TextualDocument) {
                // textual document - load with user specified encoding
                String docEncoding = ((TextualDocument) doc).getEncoding();
                // don't strip BOM on XML.
                // Keep a handle on the raw stream: if the reader cannot be
                // created (e.g. unsupported encoding) the stream must still be
                // closed.
                docStream = doc.getSourceUrl().openStream();
                docReader = new InputStreamReader(docStream, docEncoding);
                is = new InputSource(docReader);
                // must set system ID to allow relative URLs (e.g. to a DTD) to
                // work
                is.setSystemId(doc.getSourceUrl().toString());
            } else {
                // let the parser decide the encoding
                is = new InputSource(doc.getSourceUrl().toString());
            }
            newxmlParser.parse(is);
        } finally {
            // make sure the open streams are closed; closing the reader also
            // closes the stream it wraps, so the stream is closed directly only
            // when the reader was never created
            if (docReader != null) {
                docReader.close();
            } else if (docStream != null) {
                docStream.close();
            }
        }
        // Only reached after a successful parse: continue annotation ids after
        // those the handler used
        ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
    } catch (ParserConfigurationException e) {
        throw new DocumentFormatException("XML parser configuration exception ", e);
    } catch (SAXException e) {
        // Always flag the document; whether the failure is fatal depends on
        // the document's own feature setting
        doc.getFeatures().put("parsingError", Boolean.TRUE);
        Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
        if (bThrow != null && bThrow.booleanValue()) {
            // error
            throw new DocumentFormatException(e);
        } else {
            // best effort: report the problem and leave the document unparsed
            Out.println("Warning: Document remains unparsed. \n" + "\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        }
    } catch (IOException e) {
        throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
    } finally {
        if (xmlDocHandler != null) {
            xmlDocHandler.removeStatusListener(statusListener);
        }
    }
}
Also used : InputSource(org.xml.sax.InputSource) InputStreamReader(java.io.InputStreamReader) XmlDocumentHandler(gate.xml.XmlDocumentHandler) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) StringReader(java.io.StringReader) XMLStreamReader(javax.xml.stream.XMLStreamReader) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException) DocumentFormatException(gate.util.DocumentFormatException) TextualDocument(gate.TextualDocument) StringReader(java.io.StringReader) SAXParser(javax.xml.parsers.SAXParser) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException) SAXParserFactory(javax.xml.parsers.SAXParserFactory)

Aggregations

DocumentFormatException (gate.util.DocumentFormatException)11 IOException (java.io.IOException)7 StatusListener (gate.event.StatusListener)6 TextualDocument (gate.TextualDocument)3 InvalidOffsetException (gate.util.InvalidOffsetException)3 XmlDocumentHandler (gate.xml.XmlDocumentHandler)3 InputStream (java.io.InputStream)3 InputStreamReader (java.io.InputStreamReader)3 Reader (java.io.Reader)3 StringReader (java.io.StringReader)3 SAXException (org.xml.sax.SAXException)3 Annotation (gate.Annotation)2 AnnotationSet (gate.AnnotationSet)2 ResourceInstantiationException (gate.creole.ResourceInstantiationException)2 XMLStreamReader (javax.xml.stream.XMLStreamReader)2 gate (gate)1 DocumentFormat (gate.DocumentFormat)1 FeatureMap (gate.FeatureMap)1 EmailDocumentHandler (gate.email.EmailDocumentHandler)1 NekoHtmlDocumentHandler (gate.html.NekoHtmlDocumentHandler)1