use of gate.Annotation in project gate-core by GateNLP.
the class DocumentImpl method identifyTheRootAnnotation.
// writeStartTag()
/**
 * Identifies the root annotation inside an annotation set. The root
 * annotation is one that starts at offset 0 and ends at the largest offset
 * found in the set, i.e. it spans the whole set. If more than one annotation
 * qualifies, the one with the smallest ID is selected.
 *
 * @param anAnnotationSet
 *          The annotation set possibly containing the root annotation.
 * @return The root annotation, or null if the set is null, reports no
 *         boundary nodes, does not start at offset 0, or contains no
 *         annotation spanning it entirely.
 */
@SuppressWarnings("unused")
private Annotation identifyTheRootAnnotation(AnnotationSet anAnnotationSet) {
  if (anAnnotationSet == null) {
    return null;
  }
  Node startNode = anAnnotationSet.firstNode();
  Node endNode = anAnnotationSet.lastNode();
  // A set without boundary nodes (presumably an empty set - confirm against
  // the AnnotationSet implementation) cannot contain a root annotation.
  // Returning null here honours the documented contract instead of failing
  // with a NullPointerException on the dereferences below.
  if (startNode == null || endNode == null) {
    return null;
  }
  // Shortcut: a root annotation must start at offset 0, so if the smallest
  // offset in the set is not 0 there is nothing to look for.
  long start = startNode.getOffset().longValue();
  if (start != 0) {
    return null;
  }
  long end = endNode.getOffset().longValue();
  // Among the annotations covering [start, end], keep the one with the
  // smallest ID.
  Annotation theRootAnnotation = null;
  for (Annotation currentAnnot : anAnnotationSet) {
    boolean spansWholeSet =
        start == currentAnnot.getStartNode().getOffset().longValue()
            && end == currentAnnot.getEndNode().getOffset().longValue();
    if (!spansWholeSet) {
      continue;
    }
    if (theRootAnnotation == null
        || theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue()) {
      theRootAnnotation = currentAnnot;
    }
  }
  return theRootAnnotation;
}
use of gate.Annotation in project gate-core by GateNLP.
the class DocumentImpl method saveAnnotationSetAsXmlInOrig.
// hasOriginalContentFeatures
/**
* Serializes the annotations from aSourceAnnotationSet as XML tags and inserts
* them into the original document content, which is read from the
* {@code GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME} document
* feature (an empty string is used when that feature is absent).
* <p>
* Side effects visible in this method: annotations are removed from
* aSourceAnnotationSet as their start (or empty) tags are written, and
* warnings/errors are printed through {@code Out.prln}.
*
* @param aSourceAnnotationSet
* is a GATE annotation set prepared to be used on the raw text from
* document content. If it is <b>null</b> the original content is
* returned unchanged.
* @param includeFeatures
* is a boolean, which controls whether the annotation features and
* gate ID are included or not.
* @return The XML document obtained from raw text + the information from the
* dump annotation set.
*/
private String saveAnnotationSetAsXmlInOrig(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
StringBuffer docContStrBuff;
String origContent;
origContent = (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
// Fall back to empty content when the original text was not preserved.
if (origContent == null) {
origContent = "";
}
long originalContentSize = origContent.length();
// NOTE(review): repositioning is dereferenced below without a null check;
// presumably callers only get here when the feature exists - confirm.
RepositioningInfo repositioning = (RepositioningInfo) getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
docContStrBuff = new StringBuffer(origContent);
// Nothing to dump: hand back the original content as is.
if (aSourceAnnotationSet == null)
return docContStrBuff.toString();
StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
AnnotationSet originalMarkupsAnnotSet = this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
// Create a dumping annotation set on the document. It will be used for
// dumping annotations...
AnnotationSet dumpingSet = new AnnotationSetImpl(this);
if (sListener != null)
sListener.statusChanged("Constructing the dumping annotation set.");
// Then take all the annotations from aSourceAnnotationSet and verify if
// they can be inserted safely into the dumpingSet. Where not possible,
// report.
// NOTE(review): dumpingSet is only filled here; the serialization loops
// below iterate aSourceAnnotationSet, so rejected annotations are reported
// but not actually filtered out by this method - confirm this is intended.
Iterator<Annotation> iter = aSourceAnnotationSet.iterator();
Annotation currentAnnot;
while (iter.hasNext()) {
currentAnnot = iter.next();
if (insertsSafety(originalMarkupsAnnotSet, currentAnnot) && insertsSafety(dumpingSet, currentAnnot)) {
dumpingSet.add(currentAnnot);
} else {
Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() + ", startOffset=" + currentAnnot.getStartNode().getOffset() + ", endOffset=" + currentAnnot.getEndNode().getOffset() + ", type=" + currentAnnot.getType() + " was found to violate the" + " crossed over condition. It will be discarded");
}
}
// Here we go.
if (sListener != null)
sListener.statusChanged("Dumping annotations as XML");
// Collect every distinct start and end offset used by the source
// annotations; the TreeSet keeps them sorted.
TreeSet<Long> offsets = new TreeSet<Long>();
iter = aSourceAnnotationSet.iterator();
while (iter.hasNext()) {
Annotation annot = iter.next();
offsets.add(annot.getStartNode().getOffset());
offsets.add(annot.getEndNode().getOffset());
}
// Consume the offsets from the largest to the smallest; presumably so that
// text inserted at a later position does not shift the positions that are
// still to be processed.
while (!offsets.isEmpty()) {
Long offset = offsets.last();
// Remove the offset from the set
offsets.remove(offset);
// Now, use it.
// Returns a list with annotations that needs to be serialized in that
// offset.
List<Annotation> annotations = getAnnotationsForOffset(aSourceAnnotationSet, offset);
// Attention: the annotation are serialized from left to right
StringBuffer tmpBuff = new StringBuffer("");
// Holds "empty and span" annotations whose start tag was written at this
// offset and whose end tag is still pending.
Stack<Annotation> stack = new Stack<Annotation>();
// Iterate through all these annotations and serialize them
Iterator<Annotation> it = annotations.iterator();
Annotation a = null;
while (it.hasNext()) {
a = it.next();
// Drops the annotation from the local list returned above.
it.remove();
// Test if a Ends at offset
if (offset.equals(a.getEndNode().getOffset())) {
// Test if a Starts at offset
if (offset.equals(a.getStartNode().getOffset())) {
// Here, the annotation a Starts and Ends at the offset
if (null != a.getFeatures().get("isEmptyAndSpan") && "true".equals(a.getFeatures().get("isEmptyAndSpan"))) {
// Assert: annotation a with start == end and isEmptyAndSpan
tmpBuff.append(writeStartTag(a, includeFeatures, false));
stack.push(a);
} else {
// Assert annotation a with start == end and an empty tag
tmpBuff.append(writeEmptyTag(a, false));
// The annotation is removed from the caller's source set
aSourceAnnotationSet.remove(a);
}
} else {
// a only ends here: close any pending "empty and span" tags first,
// then write a's end tag
while (!stack.isEmpty()) {
Annotation a1 = stack.pop();
tmpBuff.append(writeEndTag(a1));
}
tmpBuff.append(writeEndTag(a));
}
} else {
// a does not end here; check whether it starts at the offset
if (offset.equals(a.getStartNode().getOffset())) {
// Close any pending "empty and span" tags, then write a's start tag
while (!stack.isEmpty()) {
Annotation a1 = stack.pop();
tmpBuff.append(writeEndTag(a1));
}
tmpBuff.append(writeStartTag(a, includeFeatures, false));
// The annotation is removed from the caller's source set
aSourceAnnotationSet.remove(a);
}
}
}
// Close whatever "empty and span" tags are still pending for this offset
while (!stack.isEmpty()) {
Annotation a1 = stack.pop();
tmpBuff.append(writeEndTag(a1));
}
// Map the document offset back to a position in the original content.
long originalPosition = -1;
// True when the last annotation handled at this offset ends here.
boolean backPositioning = a != null && offset.equals(a.getEndNode().getOffset());
if (backPositioning) {
// end of the annotation correction
originalPosition = repositioning.getOriginalPos(offset.intValue(), true);
}
// Fall back to the plain lookup when no position was obtained above.
if (originalPosition == -1) {
originalPosition = repositioning.getOriginalPos(offset.intValue());
}
// Insert tmpBuff to the location where it belongs in docContStrBuff
if (originalPosition != -1 && originalPosition <= originalContentSize) {
docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
} else {
Out.prln("Error in the repositioning. The offset (" + offset.intValue() + ") could not be positioned in the original document. \n" + "Calculated position is: " + originalPosition + " placed back: " + backPositioning);
}
}
// NOTE(review): theRootAnnotation is not assigned anywhere in this method,
// so this uses whatever value the field already holds - confirm intended.
if (theRootAnnotation != null)
docContStrBuff.append(writeEndTag(theRootAnnotation));
return docContStrBuff.toString();
}
use of gate.Annotation in project gate-core by GateNLP.
the class DocumentImpl method getAnnotationsForOffset.
// getAnnotationsForOffset()
/**
 * Selects, from the given list, the annotations that start and/or end at the
 * given offset and returns them in the order in which their tags are to be
 * written.
 * <p>
 * Annotations that only end at the offset come first, followed by those that
 * only start there; each group is ordered by its own AnnotationComparator.
 * Annotations that both start and end at the offset are then merged in, one
 * by one in ascending ID order, each placed in front of the first element
 * already in the result whose ID is greater (or appended if there is none).
 *
 * @param aDumpAnnotList
 *          the candidate annotations; may be null
 * @param offset
 *          the offset of interest; may be null
 * @return a new list with the ordered annotations; empty when either
 *         argument is null
 */
private List<Annotation> getAnnotationsForOffset(List<Annotation> aDumpAnnotList, Long offset) {
  List<Annotation> ordered = new ArrayList<Annotation>();
  if (aDumpAnnotList == null || offset == null) {
    return ordered;
  }
  Set<Annotation> startingHere =
      new TreeSet<Annotation>(new AnnotationComparator(ORDER_ON_END_OFFSET, DESC));
  Set<Annotation> endingHere =
      new TreeSet<Annotation>(new AnnotationComparator(ORDER_ON_START_OFFSET, DESC));
  Set<Annotation> startingAndEndingHere =
      new TreeSet<Annotation>(new AnnotationComparator(ORDER_ON_ANNOT_ID, ASC));

  // Sort every candidate into the bucket matching its relation to the offset;
  // annotations that do not touch the offset are ignored.
  for (Annotation candidate : aDumpAnnotList) {
    boolean startsHere = offset.equals(candidate.getStartNode().getOffset());
    boolean endsHere = offset.equals(candidate.getEndNode().getOffset());
    if (startsHere && endsHere) {
      startingAndEndingHere.add(candidate);
    } else if (startsHere) {
      startingHere.add(candidate);
    } else if (endsHere) {
      endingHere.add(candidate);
    }
  }

  // End tags go before start tags.
  ordered.addAll(endingHere);
  ordered.addAll(startingHere);

  // Merge the zero-length annotations in by ID.
  for (Annotation zeroLength : startingAndEndingHere) {
    Annotation firstWithGreaterId = null;
    for (Annotation placed : ordered) {
      if (placed.getId().intValue() > zeroLength.getId().intValue()) {
        firstWithGreaterId = placed;
        break;
      }
    }
    if (firstWithGreaterId == null) {
      ordered.add(zeroLength);
    } else {
      ordered.add(ordered.indexOf(firstWithGreaterId), zeroLength);
    }
  }
  return ordered;
}
use of gate.Annotation in project gate-core by GateNLP.
the class DocumentImpl method toXml.
/**
 * Returns an XML document aiming to preserve the original markups (the
 * original markup will be in the same place and format as it was before
 * processing the document) and include (if possible) the annotations
 * specified in the aSourceAnnotationSet. <b>Warning:</b> Annotations from
 * the aSourceAnnotationSet will be lost if they would cause a crossed over
 * situation.
 * <p>
 * When the document carries the original-content features, the work is
 * delegated to saveAnnotationSetAsXmlInOrig; otherwise the document is built
 * from the current content, wrapped in the root annotation's tags when one is
 * identified.
 *
 * @param aSourceAnnotationSet
 *          is an annotation set containing all the annotations that will be
 *          combined with the original markup set. If the param is
 *          <code>null</code> it will only dump the original markups.
 * @param includeFeatures
 *          is a boolean that controls whether the annotation features should
 *          be included or not. If false, only the annotation type is included
 *          in the tag.
 * @return a string representing an XML document containing the original
 *         markup + dumped annotations from the aSourceAnnotationSet
 * @throws GateRuntimeException
 *           if, while reporting a crossed over annotation in debug mode, the
 *           content lookup fails with an InvalidOffsetException (kept as the
 *           cause)
 */
@Override
@SuppressWarnings("unused")
public String toXml(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
  if (hasOriginalContentFeatures()) {
    return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet, includeFeatures);
  }
  AnnotationSet originalMarkupsAnnotSet =
      this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  // The list of annotations to dump; the original markups always go in first.
  List<Annotation> dumpingList = new ArrayList<Annotation>(originalMarkupsAnnotSet.size());
  StatusListener sListener =
      (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
  if (sListener != null) {
    sListener.statusChanged("Constructing the dumping annotation set.");
  }
  dumpingList.addAll(originalMarkupsAnnotSet);
  // Add every source annotation that insertsSafety accepts; the others are
  // dropped (and reported only in debug mode).
  if (aSourceAnnotationSet != null) {
    for (Annotation currentAnnot : aSourceAnnotationSet) {
      if (insertsSafety(dumpingList, currentAnnot)) {
        dumpingList.add(currentAnnot);
      } else if (crossedOverAnnotation != null && DEBUG) {
        try {
          Out.prln("Warning: Annotations were found to violate the " + "crossed over condition: \n"
              + "1. [" + getContent().getContent(crossedOverAnnotation.getStartNode().getOffset(), crossedOverAnnotation.getEndNode().getOffset())
              + " (" + crossedOverAnnotation.getType() + ": " + crossedOverAnnotation.getStartNode().getOffset()
              + ";" + crossedOverAnnotation.getEndNode().getOffset() + ")]\n"
              + "2. [" + getContent().getContent(currentAnnot.getStartNode().getOffset(), currentAnnot.getEndNode().getOffset())
              + " (" + currentAnnot.getType() + ": " + currentAnnot.getStartNode().getOffset()
              + ";" + currentAnnot.getEndNode().getOffset() + ")]\nThe second one will be discarded.\n");
        } catch (gate.util.InvalidOffsetException ex) {
          // Keep the original exception as the cause so its stack trace is
          // not lost.
          throw new GateRuntimeException(ex.getMessage(), ex);
        }
      }
    }
  }
  // kalina: order the dumping list by start offset
  Collections.sort(dumpingList, new gate.util.OffsetComparator());
  if (sListener != null) {
    sListener.statusChanged("Dumping annotations as XML");
  }
  StringBuffer xmlDoc = new StringBuffer(
      DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR * (this.getContent().size().intValue()));
  // Add xml header if original format was xml
  String mimeType = (String) getFeatures().get("MimeType");
  boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
  if (wasXML) {
    xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
    xmlDoc.append(getEncoding());
    xmlDoc.append("\" ?>");
    xmlDoc.append(Strings.getNl());
  }
  // Identify the root annotation and take it out of the list: its tags are
  // written here, around everything else.
  theRootAnnotation = identifyTheRootAnnotation(dumpingList);
  if (theRootAnnotation != null) {
    dumpingList.remove(theRootAnnotation);
    xmlDoc.append(writeStartTag(theRootAnnotation, includeFeatures));
  }
  // Construct and append the rest of the document
  xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures));
  if (theRootAnnotation != null) {
    xmlDoc.append(writeEndTag(theRootAnnotation));
  }
  if (sListener != null) {
    sListener.statusChanged("Done.");
  }
  return xmlDoc.toString();
}
use of gate.Annotation in project gate-core by GateNLP.
the class DocumentImpl method saveAnnotationSetAsXml.
// saveAnnotationSetAsXml()
/**
 * Builds the XML text of the document by interleaving the document content
 * with the tags of the annotations in aDumpAnnotList.
 * <p>
 * The distinct start/end offsets of the annotations are visited in ascending
 * order. For each offset, the text since the previous offset is appended
 * (with special characters replaced through DocumentXmlUtils.entitiesMap),
 * followed by the tags of the annotations that start and/or end there, in the
 * order given by getAnnotationsForOffset. The text after the last offset is
 * appended at the end.
 * <p>
 * Side effect: zero-length annotations written as empty tags are removed from
 * aDumpAnnotList.
 *
 * @param aDumpAnnotList
 *          the annotations to serialize; if null, only the content filtered
 *          by DocumentXmlUtils.filterNonXmlChars is returned
 * @param includeFeatures
 *          whether annotation features are written into the start tags
 * @return the document text with the annotation tags inserted
 */
private String saveAnnotationSetAsXml(List<Annotation> aDumpAnnotList, boolean includeFeatures) {
  final boolean hasContent = this.getContent() != null;
  final String content = hasContent ? this.getContent().toString() : "";
  StringBuffer docContStrBuff = DocumentXmlUtils.filterNonXmlChars(new StringBuffer(content));
  if (aDumpAnnotList == null) {
    return docContStrBuff.toString();
  }
  // Read the size only when there is content, so that the null check above
  // also protects the size lookups (they used to dereference getContent()
  // unconditionally).
  final long contentSize = hasContent ? this.getContent().size().longValue() : 0L;
  StringBuffer resultStrBuff =
      new StringBuffer(DOC_SIZE_MULTIPLICATION_FACTOR_AS * (int) contentSize);
  TreeMap<Long, Character> offsets2CharsMap = new TreeMap<Long, Character>();
  if (contentSize != 0) {
    // Fill the offsets2CharsMap with all the indices where
    // special chars appear
    buildEntityMapFromString(content, offsets2CharsMap);
  }

  // Collect the distinct offsets (sorted ascending) and, for each of them,
  // the annotations that start or end there.
  TreeSet<Long> offsets = new TreeSet<Long>();
  HashMap<Long, List<Annotation>> annotsForOffset = new HashMap<Long, List<Annotation>>(100);
  for (Annotation annot : aDumpAnnotList) {
    Long start = annot.getStartNode().getOffset();
    Long end = annot.getEndNode().getOffset();
    offsets.add(start);
    offsets.add(end);
    List<Annotation> atStart = annotsForOffset.get(start);
    if (atStart == null) {
      atStart = new ArrayList<Annotation>(10);
      annotsForOffset.put(start, atStart);
    }
    atStart.add(annot);
    List<Annotation> atEnd = annotsForOffset.get(end);
    if (atEnd == null) {
      atEnd = new ArrayList<Annotation>(10);
      annotsForOffset.put(end, atEnd);
    }
    atEnd.add(annot);
  }

  // last offset position used to extract portions of text
  long lastOffset = 0L;
  // This doesn't have to be a large buffer - just for tags
  StringBuffer tmpBuff = new StringBuffer(255);
  // Zero-length "isEmptyAndSpan" annotations whose start tag has been written
  // at the current offset and whose end tag is still pending.
  Stack<Annotation> stack = new Stack<Annotation>();
  // Walk the offsets in ascending order.
  for (Long offset : offsets) {
    // order annotations in list for offset to print tags in correct order
    List<Annotation> annotations = getAnnotationsForOffset(annotsForOffset.get(offset), offset);
    tmpBuff.setLength(0);
    stack.clear();
    for (Annotation a : annotations) {
      boolean endsHere = offset.equals(a.getEndNode().getOffset());
      boolean startsHere = offset.equals(a.getStartNode().getOffset());
      if (endsHere && startsHere) {
        if ("true".equals(a.getFeatures().get("isEmptyAndSpan"))) {
          // start == end and isEmptyAndSpan: open it now, close it later
          tmpBuff.append(writeStartTag(a, includeFeatures));
          stack.push(a);
        } else {
          // start == end: written as an empty tag
          tmpBuff.append(writeEmptyTag(a));
          // The annotation is removed from the dumped list
          aDumpAnnotList.remove(a);
        }
      } else if (endsHere) {
        // Close the pending tags before writing this end tag
        while (!stack.isEmpty()) {
          tmpBuff.append(writeEndTag(stack.pop()));
        }
        tmpBuff.append(writeEndTag(a));
      } else if (startsHere) {
        // Close the pending tags before writing this start tag
        while (!stack.isEmpty()) {
          tmpBuff.append(writeEndTag(stack.pop()));
        }
        tmpBuff.append(writeStartTag(a, includeFeatures));
      }
    }
    // Close whatever is still pending at this offset
    while (!stack.isEmpty()) {
      tmpBuff.append(writeEndTag(stack.pop()));
    }
    // Text since the previous offset first, then the tags for this offset
    appendEscapedText(resultStrBuff, docContStrBuff, offsets2CharsMap, lastOffset,
        offset.longValue());
    resultStrBuff.append(tmpBuff.toString());
    lastOffset = offset.longValue();
  }
  // Text from the last offset to the end of the content
  appendEscapedText(resultStrBuff, docContStrBuff, offsets2CharsMap, lastOffset,
      docContStrBuff.length());
  return resultStrBuff.toString();
}

/**
 * Appends source[from, to) to target, replacing every character whose offset
 * is a key of specialChars in that range with the replacement string that
 * DocumentXmlUtils.entitiesMap holds for it.
 */
private void appendEscapedText(StringBuffer target, StringBuffer source,
    TreeMap<Long, Character> specialChars, long from, long to) {
  long copiedUpTo = from;
  // subMap is a sorted view of [from, to); it is only read here.
  for (Long specialOffset : specialChars.subMap(from, to).keySet()) {
    String replacement = DocumentXmlUtils.entitiesMap.get(specialChars.get(specialOffset));
    target.append(source.substring((int) copiedUpTo, specialOffset.intValue()));
    target.append(replacement);
    copiedUpTo = specialOffset.longValue() + 1;
  }
  target.append(source.substring((int) copiedUpTo, (int) to));
}
Aggregations