use of gate.sgml.Sgml2Xml in project gate-core by GateNLP.
the class SgmlDocumentFormat method unpackMarkup.
/**
 * Unpack the markup in the document. This converts markup from the
 * native format (e.g. SGML) into annotations in GATE format.
 * Uses the markupElementsMap to determine which elements to convert, and
 * what annotation type names to use.
 * The doc's content is first converted to well-formed XML by
 * {@code Sgml2Xml}; the URI returned by that conversion is then parsed
 * as an XML document with a SAX parser.
 *
 * @param doc The gate document you want to parse.
 * @throws DocumentFormatException if {@code doc} is null, if it has neither
 *         a source URL nor content, or if the SAX parser cannot be
 *         configured, the converted XML cannot be parsed, or an I/O error
 *         occurs while reading it.
 */
@Override
public void unpackMarkup(Document doc) throws DocumentFormatException {
  if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }

  // Forward status messages emitted by the XML handler to this format's own listeners.
  StatusListener statusListener = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      fireStatusChanged(text);
    }
  };

  // Declared outside the try so the finally block can unregister the listener.
  XmlDocumentHandler xmlDocHandler = null;
  try {
    Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
    fireStatusChanged("Performing SGML to XML...");
    // Convert the SGML document; the result is the URI handed to the SAX parser below.
    String xmlUri = sgml2Xml.convert();
    fireStatusChanged("DONE !");

    // Configure a non-validating, namespace-aware SAX parser.
    SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
    saxParserFactory.setValidating(false);
    saxParserFactory.setNamespaceAware(true);
    SAXParser parser = saxParserFactory.newSAXParser();

    // Create the handler that turns SAX events into annotations on doc.
    xmlDocHandler = new XmlDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    xmlDocHandler.addStatusListener(statusListener);
    parser.parse(xmlUri, xmlDocHandler);

    // Carry the handler's id counter over to the document once parsing has finished.
    ((DocumentImpl) doc).setNextAnnotationId(xmlDocHandler.getCustomObjectsId());
  } catch (ParserConfigurationException e) {
    throw new DocumentFormatException("XML parser configuration exception ", e);
  } catch (SAXException e) {
    throw new DocumentFormatException(e);
  } catch (IOException e) {
    // The source URL may be null for content-only documents (see the guard above),
    // so rely on string concatenation instead of calling toString() on it, and
    // keep the original exception as the cause.
    throw new DocumentFormatException("I/O exception for " + doc.getSourceUrl(), e);
  } finally {
    if (xmlDocHandler != null) {
      xmlDocHandler.removeStatusListener(statusListener);
    }
  }
}
Aggregations