Use of gate.util.DocumentFormatException in project gate-core by GateNLP.
From the class XmlDocumentFormat, method unpackGeneralXmlMarkup.
/**
 * Unpack markup from any XML format. The XML elements are translated
 * to annotations on the Original markups annotation set.
 * <p>
 * The input is read from the document content when the document has content
 * but no valid URL, otherwise from the document's source URL (using the
 * document's own encoding for a {@code TextualDocument}, or leaving the
 * encoding decision to the parser for any other document type).
 *
 * @param doc the document to process
 * @param repInfo the repositioning information passed to the XML handler
 * @param ampCodingInfo the object with ampersand coding positions passed to
 *          the XML handler
 * @param statusListener listener registered on the XML handler while parsing
 *          and removed again before this method returns
 * @throws DocumentFormatException if the parser cannot be configured, if an
 *           I/O error occurs, or if a SAX error occurs and the document
 *           feature {@code GateConstants.THROWEX_FORMAT_PROPERTY_NAME} is
 *           {@code Boolean.TRUE}. When that feature is absent or false a SAX
 *           error is only reported on {@code Out}; in both cases the
 *           {@code "parsingError"} feature is set on the document.
 */
private void unpackGeneralXmlMarkup(Document doc, RepositioningInfo repInfo, RepositioningInfo ampCodingInfo, StatusListener statusListener) throws DocumentFormatException {
  boolean docHasContentButNoValidURL = hasContentButNoValidUrl(doc);
  XmlDocumentHandler xmlDocHandler = null;
  try {
    // use Xerces XML parser with JAXP
    // System.setProperty("javax.xml.parsers.SAXParserFactory",
    // "org.apache.xerces.jaxp.SAXParserFactoryImpl");
    // Get a parser factory.
    SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
    // Set up the factory to create the appropriate type of parser:
    // a non validating one...
    saxParserFactory.setValidating(false);
    // ...that is namespace aware
    saxParserFactory.setNamespaceAware(true);
    // create it
    SAXParser xmlParser = saxParserFactory.newSAXParser();
    // Create a new Xml document handler
    xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    // Register a status listener with it (removed in the outer finally)
    xmlDocHandler.addStatusListener(statusListener);
    // set repositioning object
    xmlDocHandler.setRepositioningInfo(repInfo);
    // set the object with ampersand coding positions
    xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
    org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
    // Apply the same settings directly on the reader:
    // http://xml.org/sax/features/validation set to false
    newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
    // http://xml.org/sax/features/namespaces set to true
    newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
    newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
    // the handler serves as content, error, DTD and entity handler
    newxmlParser.setContentHandler(xmlDocHandler);
    newxmlParser.setErrorHandler(xmlDocHandler);
    newxmlParser.setDTDHandler(xmlDocHandler);
    newxmlParser.setEntityResolver(xmlDocHandler);
    // Parse the XML Document with the appropriate encoding
    // The raw stream is tracked separately from the reader so that it can
    // still be closed if wrapping it in a reader fails.
    java.io.InputStream docStream = null;
    Reader docReader = null;
    try {
      InputSource is;
      if (docHasContentButNoValidURL) {
        // no URL, so parse from string
        is = new InputSource(new StringReader(doc.getContent().toString()));
      } else if (doc instanceof TextualDocument) {
        // textual document - load with user specified encoding
        String docEncoding = ((TextualDocument) doc).getEncoding();
        // don't strip BOM on XML.
        docStream = doc.getSourceUrl().openStream();
        // may throw UnsupportedEncodingException with docStream already open
        docReader = new InputStreamReader(docStream, docEncoding);
        is = new InputSource(docReader);
        // must set system ID to allow relative URLs (e.g. to a DTD) to
        // work
        is.setSystemId(doc.getSourceUrl().toString());
      } else {
        // let the parser decide the encoding
        is = new InputSource(doc.getSourceUrl().toString());
      }
      newxmlParser.parse(is);
    } finally {
      // make sure the open streams are closed; closing the reader also
      // closes the stream it wraps
      if (docReader != null) {
        docReader.close();
      } else if (docStream != null) {
        docStream.close();
      }
    }
    // only reached after a successful parse
    ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
  } catch (ParserConfigurationException e) {
    throw new DocumentFormatException("XML parser configuration exception ", e);
  } catch (SAXException e) {
    // always flag the document, whether or not the error is rethrown
    doc.getFeatures().put("parsingError", Boolean.TRUE);
    Boolean bThrow = (Boolean) doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
    if (bThrow != null && bThrow.booleanValue()) {
      // error
      throw new DocumentFormatException(e);
    } else {
      // best effort: report the problem and leave the document unparsed
      Out.println("Warning: Document remains unparsed. \n" + "\n Stack Dump: ");
      e.printStackTrace(Out.getPrintWriter());
    }
  } catch (IOException e) {
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
  } finally {
    if (xmlDocHandler != null) {
      xmlDocHandler.removeStatusListener(statusListener);
    }
  }
}
Aggregations