Use of gate.DocumentContent in project gate-core by GateNLP.
Class DocumentStaxUtils, method readGateXmlDocument.
/**
 * Loads a document stored in GATE XML format from a StAX reader into an
 * existing Document. The document's content is replaced by the text
 * found in the TextWithNodes element, every annotation set present in
 * the XML is cleared and refilled, relation sets are read, and finally
 * the document features are replaced by those read from the XML.
 *
 * <p>On entry the reader must be positioned on the opening GateDocument
 * tag (the last event was a START_ELEMENT whose local name is
 * "GateDocument"). On normal return the reader is positioned on the
 * matching closing GateDocument tag.
 *
 * <p>If an XMLStreamException or RuntimeException is raised after the
 * content has been replaced, the content that the document held before
 * the call is put back and the exception is rethrown. Only the content
 * is restored by this method; annotation sets that were already cleared
 * or partly filled are left as they are.
 *
 * @param xsr the source of the XML to parse
 * @param doc the document to update
 * @param statusListener optional listener that receives progress
 *          messages; may be null, in which case no messages are sent
 * @throws XMLStreamException if the reader is not positioned on the
 *           expected elements or an error occurs while reading the XML
 */
public static void readGateXmlDocument(XMLStreamReader xsr, Document doc, StatusListener statusListener) throws XMLStreamException {
  // precondition: the caller has placed us on <GateDocument>
  xsr.require(XMLStreamConstants.START_ELEMENT, null, "GateDocument");

  // the first child element must be the document-level feature map
  xsr.nextTag();
  xsr.require(XMLStreamConstants.START_ELEMENT, null, "GateDocumentFeatures");
  if (statusListener != null) {
    statusListener.statusChanged("Reading document features");
  }
  FeatureMap features = readFeatureMap(xsr);

  // next comes the text; reading it also fills in the node ID -> offset
  // table that the annotation sets are resolved against below
  xsr.nextTag();
  xsr.require(XMLStreamConstants.START_ELEMENT, null, "TextWithNodes");
  Map<Integer, Long> offsetsByNodeId = new HashMap<Integer, Long>();
  if (statusListener != null) {
    statusListener.statusChanged("Reading document content");
  }
  String text = readTextWithNodes(xsr, offsetsByNodeId);

  // remember the current content so it can be put back on failure, then
  // switch the document over to the text just read
  DocumentContent previousContent = doc.getContent();
  doc.setContent(new DocumentContentImpl(text));

  try {
    // running total reported in the final status message
    int processed = 0;
    // largest annotation/relation ID seen so far, null until one is seen
    Integer highestId = null;
    // null until readAnnotationSet has decided whether IDs are required
    Boolean idsRequired = null;

    // zero or more <AnnotationSet> elements
    int event = xsr.nextTag();
    for (; event == XMLStreamConstants.START_ELEMENT && xsr.getLocalName().equals("AnnotationSet"); event = xsr.nextTag()) {
      xsr.require(XMLStreamConstants.START_ELEMENT, null, "AnnotationSet");
      String setName = xsr.getAttributeValue(null, "Name");
      // a missing Name attribute denotes the default annotation set
      boolean isDefaultSet = (setName == null);
      if (statusListener != null) {
        statusListener.statusChanged(isDefaultSet
            ? "Reading default annotation set"
            : "Reading \"" + setName + "\" annotation set");
      }
      AnnotationSet target = isDefaultSet ? doc.getAnnotations() : doc.getAnnotations(setName);
      target.clear();

      SortedSet<Integer> idsSeen = new TreeSet<Integer>();
      idsRequired = readAnnotationSet(xsr, target, offsetsByNodeId, idsSeen, idsRequired);
      if (!idsSeen.isEmpty()) {
        Integer largest = idsSeen.last();
        if (highestId == null || largest.intValue() > highestId.intValue()) {
          highestId = largest;
        }
      }
      processed += idsSeen.size();
      // readAnnotationSet leaves the reader on </AnnotationSet>; the
      // loop's nextTag() moves on to the next <AnnotationSet>, a
      // <RelationSet>, or </GateDocument>
    }

    // zero or more <RelationSet> elements
    for (; event == XMLStreamConstants.START_ELEMENT && xsr.getLocalName().equals("RelationSet"); event = xsr.nextTag()) {
      xsr.require(XMLStreamConstants.START_ELEMENT, null, "RelationSet");
      String ownerName = xsr.getAttributeValue(null, "Name");
      // as above, no Name attribute means the default annotation set
      boolean isDefaultOwner = (ownerName == null);
      if (statusListener != null) {
        statusListener.statusChanged(isDefaultOwner
            ? "Reading relation set for default annotation set"
            : "Reading relation set for \"" + ownerName + "\" annotation set");
      }
      AnnotationSet owner = isDefaultOwner ? doc.getAnnotations() : doc.getAnnotations(ownerName);
      RelationSet relations = owner.getRelations();

      SortedSet<Integer> relationIds = new TreeSet<Integer>();
      readRelationSet(xsr, relations, relationIds);
      if (!relationIds.isEmpty()) {
        Integer largest = relationIds.last();
        if (highestId == null || largest.intValue() > highestId.intValue()) {
          highestId = largest;
        }
      }
      processed += relationIds.size();
      // readRelationSet leaves the reader on </RelationSet>; the loop's
      // nextTag() moves on to the next <RelationSet> or </GateDocument>
    }

    // nothing else may follow: we must now be on </GateDocument>
    xsr.require(XMLStreamConstants.END_ELEMENT, null, "GateDocument");

    doc.setFeatures(features);

    // advance the ID generator past every ID that was read, when the
    // document is a DocumentImpl and at least one ID was seen
    if (highestId != null && doc instanceof DocumentImpl) {
      DocumentImpl impl = (DocumentImpl) doc;
      impl.setNextAnnotationId(highestId.intValue() + 1);
    }

    if (statusListener != null) {
      statusListener.statusChanged("Finished. " + processed + " annotation(s) processed");
    }
  } catch (XMLStreamException parseFailure) {
    // put the previous content back before propagating the failure
    doc.setContent(previousContent);
    throw parseFailure;
  } catch (RuntimeException unexpected) {
    // same recovery for unchecked failures
    doc.setContent(previousContent);
    throw unexpected;
  }
}
Aggregations