Search in sources :

Example 21 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class DocumentXmlUtils method textWithNodes.

// replaceCharsWithEntities()
/**
 * Returns the document's text interspersed with {@code <Node>} elements at all
 * points where the document has an annotation beginning or ending.
 * <p>
 * Offsets are collected from the default annotation set and from every named
 * annotation set of the document. Each emitted element has the form
 * {@code <Node id="OFFSET"/>}, where OFFSET is the character offset of the
 * node. Characters recorded by {@code buildEntityMapFromString} are replaced
 * with the matching entry of {@code entitiesMap}.
 *
 * @param doc
 *          the document whose annotation start/end offsets are marked
 * @param aText
 *          the text to decorate; may be <code>null</code>
 * @return the decorated text; an empty string if {@code aText} is
 *         <code>null</code>; the result of
 *         {@code replaceCharsWithEntities(aText)} if the document has no
 *         annotations at all
 */
public static String textWithNodes(TextualDocument doc, String aText) {
    if (aText == null) {
        return "";
    }
    StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
    // Map from offset to the special character found there in aText.
    SortedMap<Long, Character> offsets2CharsMap = new TreeMap<Long, Character>();
    if (aText.length() != 0) {
        buildEntityMapFromString(aText, offsets2CharsMap);
    }
    // Every offset at which some annotation starts or ends: default set first...
    SortedSet<Long> offsetsSet = new TreeSet<Long>();
    for (Annotation annot : doc.getAnnotations()) {
        offsetsSet.add(annot.getStartNode().getOffset());
        offsetsSet.add(annot.getEndNode().getOffset());
    }
    // ...then all named annotation sets.
    Map<String, AnnotationSet> namedAnnotSets = doc.getNamedAnnotationSets();
    if (namedAnnotSets != null) {
        for (AnnotationSet annotSet : namedAnnotSets.values()) {
            for (Annotation annot : annotSet) {
                offsetsSet.add(annot.getStartNode().getOffset());
                offsetsSet.add(annot.getEndNode().getOffset());
            }
        }
    }
    // No nodes to emit: only the entity replacement is needed.
    if (offsetsSet.isEmpty()) {
        return replaceCharsWithEntities(aText).toString();
    }
    StringBuilder modifiedBuffer = new StringBuilder(textWithNodes.length() * 2);
    // Index of the first character of the filtered text not yet copied.
    int lastCharacterCopied = 0;
    // Walk node offsets and entity offsets together, in ascending order.
    SortedSet<Long> allOffsets = new TreeSet<Long>(offsetsSet);
    allOffsets.addAll(offsets2CharsMap.keySet());
    for (Long nextOffset : allOffsets) {
        int nextOffsetInt = nextOffset.intValue();
        // Copy the plain text lying between the previous stop and this offset.
        if (nextOffsetInt > lastCharacterCopied) {
            modifiedBuffer.append(textWithNodes.substring(lastCharacterCopied, nextOffsetInt));
            lastCharacterCopied = nextOffsetInt;
        }
        // The node element goes before the character found at this offset.
        if (offsetsSet.contains(nextOffset)) {
            modifiedBuffer.append("<Node id=\"").append(nextOffsetInt).append("\"/>");
        }
        // Replace the special character with its entity and skip the original.
        if (offsets2CharsMap.containsKey(nextOffset)) {
            String entityString = entitiesMap.get(offsets2CharsMap.get(nextOffset));
            lastCharacterCopied++;
            modifiedBuffer.append(entityString);
        }
    }
    // Copy whatever text remains after the last offset.
    modifiedBuffer.append(textWithNodes.substring(lastCharacterCopied, textWithNodes.length()));
    return modifiedBuffer.toString();
}
Also used : AnnotationSet(gate.AnnotationSet) TreeMap(java.util.TreeMap) Annotation(gate.Annotation) TreeSet(java.util.TreeSet)

Example 22 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class DocumentImpl method writeEmptyTag.

// writeEmptyTag
/**
 * Returns a string representing an empty tag based on the input annot.
 * <p>
 * The tag name is the annotation type, prefixed with the namespace prefix
 * stored in the annotation's features when namespace serialization is
 * enabled and a non-empty prefix is present. A {@code gateId} attribute is
 * written for annotations that are not part of the original markups
 * annotation set.
 *
 * @param annot
 *          the annotation to serialize; may be <code>null</code>
 * @param includeNamespace
 *          passed through to {@code writeFeatures}
 * @return the empty-element tag, or an empty string if {@code annot} is
 *         <code>null</code>
 */
private String writeEmptyTag(Annotation annot, boolean includeNamespace) {
    // Guard first: the namespace lookup below dereferences annot.
    if (annot == null) {
        return "";
    }
    // Get the annot feature used to store the namespace prefix, if it
    // has been defined
    String nsPrefix = null;
    if (serializeNamespaceInfo) {
        nsPrefix = (String) annot.getFeatures().get(namespacePrefixFeature);
    }
    StringBuilder strBuff = new StringBuilder("<");
    if (nsPrefix != null && !nsPrefix.isEmpty()) {
        strBuff.append(nsPrefix).append(':');
    }
    strBuff.append(annot.getType());
    AnnotationSet originalMarkupsAnnotSet = this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    if (!originalMarkupsAnnotSet.contains(annot)) {
        strBuff.append(" gateId=\"");
        strBuff.append(annot.getId());
        strBuff.append("\"");
    }
    strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
    strBuff.append("/>");
    return strBuff.toString();
}
Also used : AnnotationSet(gate.AnnotationSet)

Example 23 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class DocumentImpl method saveAnnotationSetAsXmlInOrig.

// hasOriginalContentFeatures
/**
 * Combines the annotations from aSourceAnnotationSet with the original
 * document content, which is read from the
 * ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME document feature (an empty string is
 * used when that feature is absent).
 * <p>
 * Tags are inserted into the original content at positions computed through
 * the RepositioningInfo stored in the
 * DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME feature. Offsets are processed
 * from the highest to the lowest.
 * <p>
 * <b>Side effect:</b> annotations are removed from aSourceAnnotationSet while
 * their start tags (or empty tags) are written.
 *
 * @param aSourceAnnotationSet
 *          the annotations to write into the original content. If it is
 *          <code>null</code>, the original content is returned unchanged.
 * @param includeFeatures
 *          is a boolean which is passed on to writeStartTag for every start
 *          tag that is written.
 * @return The XML document obtained from the original content + the tags
 *         written for the source annotation set.
 */
private String saveAnnotationSetAsXmlInOrig(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
    StringBuffer docContStrBuff;
    String origContent;
    origContent = (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
    if (origContent == null) {
        origContent = "";
    }
    // Remember the size before any tag is inserted; it bounds valid positions.
    long originalContentSize = origContent.length();
    // NOTE(review): repositioning is dereferenced below without a null check;
    // presumably hasOriginalContentFeatures() guarantees it is set - confirm.
    RepositioningInfo repositioning = (RepositioningInfo) getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
    docContStrBuff = new StringBuffer(origContent);
    if (aSourceAnnotationSet == null)
        return docContStrBuff.toString();
    StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
    AnnotationSet originalMarkupsAnnotSet = this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    // Create a dumping annotation set on the document. It is used here to
    // check each source annotation for the crossed over condition.
    // NOTE(review): the offsets and the per-offset annotations below are taken
    // from aSourceAnnotationSet, not from dumpingSet, so annotations reported
    // as discarded do not appear to be filtered out here - confirm intent.
    AnnotationSet dumpingSet = new AnnotationSetImpl(this);
    if (sListener != null)
        sListener.statusChanged("Constructing the dumping annotation set.");
    // Take all the annotations from aSourceAnnotationSet and verify if
    // they can be inserted safely into the dumpingSet. Where not possible,
    // report.
    Iterator<Annotation> iter = aSourceAnnotationSet.iterator();
    Annotation currentAnnot;
    while (iter.hasNext()) {
        currentAnnot = iter.next();
        if (insertsSafety(originalMarkupsAnnotSet, currentAnnot) && insertsSafety(dumpingSet, currentAnnot)) {
            dumpingSet.add(currentAnnot);
        } else {
            Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() + ", startOffset=" + currentAnnot.getStartNode().getOffset() + ", endOffset=" + currentAnnot.getEndNode().getOffset() + ", type=" + currentAnnot.getType() + " was found to violate the" + " crossed over condition. It will be discarded");
        }
    }
    // Here we go.
    if (sListener != null)
        sListener.statusChanged("Dumping annotations as XML");
    // Construct the sorted set of all offsets at which an annotation of
    // aSourceAnnotationSet starts or ends.
    TreeSet<Long> offsets = new TreeSet<Long>();
    iter = aSourceAnnotationSet.iterator();
    while (iter.hasNext()) {
        Annotation annot = iter.next();
        offsets.add(annot.getStartNode().getOffset());
        offsets.add(annot.getEndNode().getOffset());
    }
    // Consume the offsets from the largest to the smallest (presumably so that
    // an insertion never shifts a position that is still to be processed).
    while (!offsets.isEmpty()) {
        Long offset = offsets.last();
        // Remove the offset from the set
        offsets.remove(offset);
        // The annotations that need to be serialized at this offset.
        List<Annotation> annotations = getAnnotationsForOffset(aSourceAnnotationSet, offset);
        // Attention: the annotations are serialized from left to right
        StringBuffer tmpBuff = new StringBuffer("");
        // Holds zero-length "isEmptyAndSpan" annotations whose start tag has
        // been written and whose end tag is still pending.
        Stack<Annotation> stack = new Stack<Annotation>();
        // Iterate through all these annotations and serialize them
        Iterator<Annotation> it = annotations.iterator();
        // After the loop, a refers to the last annotation that was processed.
        Annotation a = null;
        while (it.hasNext()) {
            a = it.next();
            it.remove();
            // Test if a Ends at offset
            if (offset.equals(a.getEndNode().getOffset())) {
                // Test if a Starts at offset
                if (offset.equals(a.getStartNode().getOffset())) {
                    // Here, the annotation a Starts and Ends at the offset
                    if (null != a.getFeatures().get("isEmptyAndSpan") && "true".equals(a.getFeatures().get("isEmptyAndSpan"))) {
                        // Assert: annotation a with start == end and isEmptyAndSpan;
                        // write the start tag now, the end tag when the stack is emptied
                        tmpBuff.append(writeStartTag(a, includeFeatures, false));
                        stack.push(a);
                    } else {
                        // Assert annotation a with start == end and an empty tag
                        tmpBuff.append(writeEmptyTag(a, false));
                        // The annotation is removed from the caller's source set
                        aSourceAnnotationSet.remove(a);
                    }
                } else {
                    // a only Ends here: empty the stack, then write the end tag of a
                    while (!stack.isEmpty()) {
                        Annotation a1 = stack.pop();
                        tmpBuff.append(writeEndTag(a1));
                    }
                    tmpBuff.append(writeEndTag(a));
                }
            } else {
                // a does not End at the offset; test if it Starts at the offset
                if (offset.equals(a.getStartNode().getOffset())) {
                    // In this case empty the stack and write the start tag
                    while (!stack.isEmpty()) {
                        Annotation a1 = stack.pop();
                        tmpBuff.append(writeEndTag(a1));
                    }
                    tmpBuff.append(writeStartTag(a, includeFeatures, false));
                    // The annotation is removed from the caller's source set
                    aSourceAnnotationSet.remove(a);
                }
            }
        }
        // Close whatever is still pending on the stack
        while (!stack.isEmpty()) {
            Annotation a1 = stack.pop();
            tmpBuff.append(writeEndTag(a1));
        }
        long originalPosition = -1;
        // If the last processed annotation ends at this offset, first try the
        // end-of-annotation variant of the position lookup.
        boolean backPositioning = a != null && offset.equals(a.getEndNode().getOffset());
        if (backPositioning) {
            // end of the annotation correction
            originalPosition = repositioning.getOriginalPos(offset.intValue(), true);
        }
        // Fall back to the plain lookup when no position was obtained above
        if (originalPosition == -1) {
            originalPosition = repositioning.getOriginalPos(offset.intValue());
        }
        // Insert tmpBuff to the location where it belongs in docContStrBuff
        if (originalPosition != -1 && originalPosition <= originalContentSize) {
            docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
        } else {
            Out.prln("Error in the repositioning. The offset (" + offset.intValue() + ") could not be positioned in the original document. \n" + "Calculated position is: " + originalPosition + " placed back: " + backPositioning);
        }
    }
    // All offsets processed; close the root annotation if one is set
    if (theRootAnnotation != null)
        docContStrBuff.append(writeEndTag(theRootAnnotation));
    return docContStrBuff.toString();
}
Also used : AnnotationSet(gate.AnnotationSet) Annotation(gate.Annotation) Stack(java.util.Stack) AnnotationSetImpl(gate.annotation.AnnotationSetImpl) TreeSet(java.util.TreeSet) StatusListener(gate.event.StatusListener)

Example 24 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class DocumentImpl method toXml.

/**
 * Returns an XML document aiming to preserve the original markups (the
 * original markup will be in the same place and format as it was before
 * processing the document) and include (if possible) the annotations
 * specified in the aSourceAnnotationSet. <b>Warning:</b> Annotations from
 * the aSourceAnnotationSet will be lost if they would cause a crossed over
 * situation.
 * <p>
 * If the document has original content features, the work is delegated to
 * {@code saveAnnotationSetAsXmlInOrig}. Otherwise the original markups and
 * the safely insertable source annotations are sorted with an
 * {@code OffsetComparator} and serialized; as a side effect the
 * {@code theRootAnnotation} field is reassigned.
 *
 * @param aSourceAnnotationSet
 *          is an annotation set containing all the annotations that will be
 *          combined with the original markup set. If the param is
 *          <code>null</code> it will only dump the original markups.
 * @param includeFeatures
 *          is a boolean that controls whether the annotation features should
 *          be included or not. If false, only the annotation type is included
 *          in the tag.
 * @return a string representing an XML document containing the original
 *         markup + dumped annotations from the aSourceAnnotationSet
 * @throws GateRuntimeException
 *           if the content of a crossed over annotation cannot be read while
 *           building the debug warning; the causing InvalidOffsetException is
 *           attached as the cause
 */
@Override
@SuppressWarnings("unused")
public String toXml(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
    if (hasOriginalContentFeatures()) {
        return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet, includeFeatures);
    }
    AnnotationSet originalMarkupsAnnotSet = this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    // The list of annotations to dump; it starts with the original markups.
    List<Annotation> dumpingList = new ArrayList<Annotation>(originalMarkupsAnnotSet.size());
    StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
    if (sListener != null) {
        sListener.statusChanged("Constructing the dumping annotation set.");
    }
    dumpingList.addAll(originalMarkupsAnnotSet);
    // Add every source annotation that can be inserted safely; the others are
    // dropped, with a warning when DEBUG is on.
    if (aSourceAnnotationSet != null) {
        for (Annotation currentAnnot : aSourceAnnotationSet) {
            if (insertsSafety(dumpingList, currentAnnot)) {
                dumpingList.add(currentAnnot);
            } else if (crossedOverAnnotation != null && DEBUG) {
                try {
                    Out.prln("Warning: Annotations were found to violate the " + "crossed over condition: \n"
                            + "1. [" + getContent().getContent(crossedOverAnnotation.getStartNode().getOffset(), crossedOverAnnotation.getEndNode().getOffset())
                            + " (" + crossedOverAnnotation.getType() + ": " + crossedOverAnnotation.getStartNode().getOffset() + ";" + crossedOverAnnotation.getEndNode().getOffset() + ")]\n"
                            + "2. [" + getContent().getContent(currentAnnot.getStartNode().getOffset(), currentAnnot.getEndNode().getOffset())
                            + " (" + currentAnnot.getType() + ": " + currentAnnot.getStartNode().getOffset() + ";" + currentAnnot.getEndNode().getOffset() + ")]\nThe second one will be discarded.\n");
                } catch (gate.util.InvalidOffsetException ex) {
                    // Keep the original exception as the cause so the stack
                    // trace of the offset failure is not lost.
                    throw new GateRuntimeException(ex.getMessage(), ex);
                }
            }
        }
    }
    // kalina: order the dumping list by start offset
    Collections.sort(dumpingList, new gate.util.OffsetComparator());
    if (sListener != null) {
        sListener.statusChanged("Dumping annotations as XML");
    }
    StringBuilder xmlDoc = new StringBuilder(DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR * (this.getContent().size().intValue()));
    // Add xml header if original format was xml
    String mimeType = (String) getFeatures().get("MimeType");
    boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
    if (wasXML) {
        xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
        xmlDoc.append(getEncoding());
        xmlDoc.append("\" ?>");
        xmlDoc.append(Strings.getNl());
    }
    // Identify the root annotation and take it out of the dumping list; its
    // start tag opens the document.
    theRootAnnotation = identifyTheRootAnnotation(dumpingList);
    if (theRootAnnotation != null) {
        dumpingList.remove(theRootAnnotation);
        xmlDoc.append(writeStartTag(theRootAnnotation, includeFeatures));
    }
    // Construct and append the rest of the document
    xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures));
    // The root's end tag closes the document
    if (theRootAnnotation != null) {
        xmlDoc.append(writeEndTag(theRootAnnotation));
    }
    if (sListener != null) {
        sListener.statusChanged("Done.");
    }
    return xmlDoc.toString();
}
Also used : ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) GateRuntimeException(gate.util.GateRuntimeException) StatusListener(gate.event.StatusListener)

Example 25 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class AnnotationSetImpl method readObject.

/**
 * Restores this annotation set from a serialised stream.
 * <p>
 * Two layouts are accepted. The new style carries an {@code annotations}
 * array followed by two booleans telling whether the type index and the
 * start-node index have to be rebuilt. The old style carries an
 * {@code annotsById} map instead and rebuilds no index. In both cases the
 * annotations are re-added one by one and the temporary array is dropped.
 *
 * @param in
 *          the stream to read from
 * @throws IOException
 *           if reading fails, or if the stream holds neither the annotations
 *           array nor the map by id
 * @throws ClassNotFoundException
 *           if a serialised class cannot be resolved
 */
private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
    this.longestAnnot = 0L;
    ObjectInputStream.GetField fields = in.readFields();
    this.name = (String) fields.get("name", null);
    this.doc = (DocumentImpl) fields.get("doc", null);
    this.annotations = (Annotation[]) fields.get("annotations", null);

    boolean rebuildTypeIndex = false;
    boolean rebuildStartNodeIndex = false;
    if (this.annotations != null) {
        // new style serialised version: the index flags follow the fields
        rebuildTypeIndex = in.readBoolean();
        rebuildStartNodeIndex = in.readBoolean();
    } else {
        // old style serialised version: recover the array from the id map
        @SuppressWarnings("unchecked") Map<Integer, Annotation> legacyById = (Map<Integer, Annotation>) fields.get("annotsById", null);
        if (legacyById == null) {
            throw new IOException("Invalid serialised data: neither annotations array or map by id" + " are present.");
        }
        annotations = legacyById.values().toArray(new Annotation[0]);
    }

    this.annotsById = new HashMap<Integer, Annotation>(annotations.length);
    // create the empty indices that were present when the set was written
    if (rebuildTypeIndex) {
        annotsByType = new HashMap<String, AnnotationSet>(Gate.HASH_STH_SIZE);
    }
    if (rebuildStartNodeIndex) {
        nodesByOffset = new RBTreeMap<Long, Node>();
        annotsByStartNode = new HashMap<Integer, Object>(annotations.length);
    }
    // re-adding each annotation fills annotsById and any index created above
    for (Annotation restored : annotations) {
        add(restored);
    }
    this.relations = (RelationSet) fields.get("relations", null);
    // the array was only needed to carry the annotations across the stream
    annotations = null;
}
Also used : Node(gate.Node) AnnotationSet(gate.AnnotationSet) IOException(java.io.IOException) Annotation(gate.Annotation) HashMap(java.util.HashMap) Map(java.util.Map) FeatureMap(gate.FeatureMap) RBTreeMap(gate.util.RBTreeMap) ObjectInputStream(java.io.ObjectInputStream)

Aggregations

AnnotationSet (gate.AnnotationSet)43 Annotation (gate.Annotation)27 ArrayList (java.util.ArrayList)14 HashMap (java.util.HashMap)11 HashSet (java.util.HashSet)11 Document (gate.Document)9 List (java.util.List)8 FeatureMap (gate.FeatureMap)7 InvalidOffsetException (gate.util.InvalidOffsetException)6 AnnotationSetImpl (gate.annotation.AnnotationSetImpl)5 Set (java.util.Set)5 StatusListener (gate.event.StatusListener)4 GateRuntimeException (gate.util.GateRuntimeException)4 Point (java.awt.Point)4 IOException (java.io.IOException)4 URL (java.net.URL)4 Map (java.util.Map)4 Color (java.awt.Color)3 TreeSet (java.util.TreeSet)3 TestDocument (gate.corpora.TestDocument)2