Search in sources :

Example 1 with DocumentContent

Use of gate.DocumentContent in project gate-core by GateNLP.

The method readGateXmlDocument of the class DocumentStaxUtils.

/**
 * Populates a Document from GATE XML format data supplied by an
 * XMLStreamReader. The document's content is replaced by the text
 * found in the TextWithNodes element, every annotation set mentioned
 * in the XML is cleared and refilled, relation sets are read into the
 * corresponding annotation sets, and finally the document features
 * are replaced by those read from the XML.
 * <p>
 * On entry the reader must be positioned on the opening GateDocument
 * tag (i.e. the last event was a START_ELEMENT for which getLocalName
 * returns "GateDocument"). On normal return the reader is positioned
 * on the matching closing tag.
 * <p>
 * If an XMLStreamException or RuntimeException occurs after the new
 * content has been installed, the content the document held before
 * the call is put back and the exception is rethrown. Only the
 * content is restored; annotation sets that were already cleared or
 * modified are not rolled back.
 *
 * @param xsr the source of the XML to parse
 * @param doc the document to update
 * @param statusListener optional status listener to receive status
 *          messages; may be null, in which case no messages are sent
 * @throws XMLStreamException if the reader is not positioned on the
 *           expected elements or reading from it fails
 */
public static void readGateXmlDocument(XMLStreamReader xsr, Document doc, StatusListener statusListener) throws XMLStreamException {
    // the listener is optional, so decide once whether to report progress
    final boolean reporting = statusListener != null;

    // precondition: we must be sitting on <GateDocument>
    xsr.require(XMLStreamConstants.START_ELEMENT, null, "GateDocument");

    // --- document features ---
    xsr.nextTag();
    xsr.require(XMLStreamConstants.START_ELEMENT, null, "GateDocumentFeatures");
    if (reporting) {
        statusListener.statusChanged("Reading document features");
    }
    FeatureMap parsedFeatures = readFeatureMap(xsr);

    // --- document text; the map is filled with node ID -> offset ---
    xsr.nextTag();
    xsr.require(XMLStreamConstants.START_ELEMENT, null, "TextWithNodes");
    Map<Integer, Long> offsetsByNodeId = new HashMap<>();
    if (reporting) {
        statusListener.statusChanged("Reading document content");
    }
    String text = readTextWithNodes(xsr, offsetsByNodeId);

    // remember what the document held so it can be put back on failure
    final DocumentContent previousContent = doc.getContent();
    doc.setContent(new DocumentContentImpl(text));
    try {
        int processedCount = 0;
        // largest annotation/relation ID seen so far, null until one is seen
        Integer highestId = null;
        // not yet known whether annotation IDs are required; the value
        // returned by readAnnotationSet is fed back into the next call
        Boolean idsRequired = null;

        int event = xsr.nextTag();

        // --- annotation sets, resolved against the node offset map ---
        // readAnnotationSet leaves the reader on </AnnotationSet>, so the
        // nextTag in the update clause moves to the next <AnnotationSet>,
        // a <RelationSet>, or </GateDocument>
        for (; event == XMLStreamConstants.START_ELEMENT && "AnnotationSet".equals(xsr.getLocalName()); event = xsr.nextTag()) {
            xsr.require(XMLStreamConstants.START_ELEMENT, null, "AnnotationSet");
            String setName = xsr.getAttributeValue(null, "Name");
            // a missing Name attribute denotes the default annotation set
            boolean defaultSet = (setName == null);
            if (reporting) {
                statusListener.statusChanged(defaultSet
                        ? "Reading default annotation set"
                        : "Reading \"" + setName + "\" annotation set");
            }
            AnnotationSet target = defaultSet ? doc.getAnnotations() : doc.getAnnotations(setName);
            target.clear();

            SortedSet<Integer> idsInSet = new TreeSet<>();
            idsRequired = readAnnotationSet(xsr, target, offsetsByNodeId, idsInSet, idsRequired);
            if (!idsInSet.isEmpty() && (highestId == null || idsInSet.last() > highestId)) {
                highestId = idsInSet.last();
            }
            processedCount += idsInSet.size();
        }

        // --- relation sets ---
        // readRelationSet leaves the reader on </RelationSet>, so the
        // nextTag in the update clause moves to the next <RelationSet>
        // or to </GateDocument>
        for (; event == XMLStreamConstants.START_ELEMENT && "RelationSet".equals(xsr.getLocalName()); event = xsr.nextTag()) {
            xsr.require(XMLStreamConstants.START_ELEMENT, null, "RelationSet");
            String ownerSetName = xsr.getAttributeValue(null, "Name");
            // a missing Name attribute denotes the default annotation set
            boolean defaultSet = (ownerSetName == null);
            if (reporting) {
                statusListener.statusChanged(defaultSet
                        ? "Reading relation set for default annotation set"
                        : "Reading relation set for \"" + ownerSetName + "\" annotation set");
            }
            RelationSet target = defaultSet
                    ? doc.getAnnotations().getRelations()
                    : doc.getAnnotations(ownerSetName).getRelations();

            SortedSet<Integer> idsInSet = new TreeSet<>();
            readRelationSet(xsr, target, idsInSet);
            // relation IDs take part in the same "highest ID" computation
            if (!idsInSet.isEmpty() && (highestId == null || idsInSet.last() > highestId)) {
                highestId = idsInSet.last();
            }
            processedCount += idsInSet.size();
        }

        // anything other than </GateDocument> here is malformed input
        xsr.require(XMLStreamConstants.END_ELEMENT, null, "GateDocument");
        doc.setFeatures(parsedFeatures);

        // seed the ID generator past the highest ID read, if doc is a
        // DocumentImpl and at least one ID was seen
        if (doc instanceof DocumentImpl && highestId != null) {
            ((DocumentImpl) doc).setNextAnnotationId(highestId.intValue() + 1);
        }
        if (reporting) {
            statusListener.statusChanged("Finished.  " + processedCount + " annotation(s) processed");
        }
    } catch (XMLStreamException | RuntimeException failure) {
        // parsing failed part-way: put the previous content back, then
        // let the original exception propagate unchanged
        doc.setContent(previousContent);
        throw failure;
    }
}
Also used : HashMap(java.util.HashMap) AnnotationSet(gate.AnnotationSet) FeatureMap(gate.FeatureMap) GateRuntimeException(gate.util.GateRuntimeException) XMLStreamException(javax.xml.stream.XMLStreamException) DocumentContent(gate.DocumentContent) TreeSet(java.util.TreeSet) RelationSet(gate.relations.RelationSet)

Aggregations

AnnotationSet (gate.AnnotationSet)1 DocumentContent (gate.DocumentContent)1 FeatureMap (gate.FeatureMap)1 RelationSet (gate.relations.RelationSet)1 GateRuntimeException (gate.util.GateRuntimeException)1 HashMap (java.util.HashMap)1 TreeSet (java.util.TreeSet)1 XMLStreamException (javax.xml.stream.XMLStreamException)1