use of gate.AnnotationSet in project gate-core by GateNLP.
the class DocumentXmlUtils method textWithNodes.
// replaceCharsWithEntities()
/**
 * Returns the document's text interspersed with {@code <Node id="..."/>}
 * elements at all points where the document has an annotation beginning or
 * ending, with special characters replaced by their entity strings.
 *
 * <p>Node offsets are collected from the default annotation set and from
 * every named annotation set of {@code doc}. Characters indexed by
 * {@code buildEntityMapFromString} are replaced by the matching value from
 * {@code entitiesMap}. When a node and an entity share an offset, the node
 * element is written first.
 *
 * @param doc the document whose annotation start/end offsets become nodes
 * @param aText the text to decorate; offsets are interpreted as indices
 *          into this text
 * @return the decorated text; an empty string if {@code aText} is
 *         {@code null}; the result of {@code replaceCharsWithEntities(aText)}
 *         if the document has no annotations at all
 */
public static String textWithNodes(TextualDocument doc, String aText) {
  if (aText == null) {
    return "";
  }
  // Working copy of the text after filterNonXmlChars has processed it.
  StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));

  // Map from offset to the special character found there.
  SortedMap<Long, Character> offsets2CharsMap = new TreeMap<Long, Character>();
  if (aText.length() != 0) {
    buildEntityMapFromString(aText, offsets2CharsMap);
  }

  // Offsets of every annotation boundary in the default annotation set ...
  SortedSet<Long> offsetsSet = new TreeSet<Long>();
  for (Annotation annot : doc.getAnnotations()) {
    offsetsSet.add(annot.getStartNode().getOffset());
    offsetsSet.add(annot.getEndNode().getOffset());
  }
  // ... and in all named annotation sets.
  Map<String, AnnotationSet> namedAnnotSets = doc.getNamedAnnotationSets();
  if (namedAnnotSets != null) {
    for (AnnotationSet annotSet : namedAnnotSets.values()) {
      for (Annotation annot : annotSet) {
        offsetsSet.add(annot.getStartNode().getOffset());
        offsetsSet.add(annot.getEndNode().getOffset());
      }
    }
  }

  // No nodes to insert: only the entity replacement is needed.
  if (offsetsSet.isEmpty()) {
    return replaceCharsWithEntities(aText).toString();
  }

  // Local, single-threaded output buffer, so no synchronisation is needed.
  StringBuilder modifiedBuffer = new StringBuilder(textWithNodes.length() * 2);
  // Index of the first character of the text not yet copied to the output.
  int lastCharacterCopied = 0;

  // Walk node offsets and entity offsets together, in ascending order.
  Set<Long> allOffsets = new TreeSet<Long>(offsetsSet);
  allOffsets.addAll(offsets2CharsMap.keySet());
  for (Long nextOffset : allOffsets) {
    int nextOffsetInt = nextOffset.intValue();
    // Copy any plain text between the previous stop and this offset.
    if (nextOffsetInt > lastCharacterCopied) {
      modifiedBuffer.append(textWithNodes.substring(lastCharacterCopied, nextOffsetInt));
      lastCharacterCopied = nextOffsetInt;
    }
    // Emit the node marker, if an annotation starts or ends here.
    if (offsetsSet.contains(nextOffset)) {
      modifiedBuffer.append("<Node id=\"").append(nextOffsetInt).append("\"/>");
    }
    // Emit the entity instead of the special character at this offset.
    if (offsets2CharsMap.containsKey(nextOffset)) {
      String entityString = entitiesMap.get(offsets2CharsMap.get(nextOffset));
      // Skip the replaced character in the source text.
      lastCharacterCopied++;
      modifiedBuffer.append(entityString);
    }
  }
  // Copy whatever text remains after the last offset.
  modifiedBuffer.append(textWithNodes.substring(lastCharacterCopied));
  return modifiedBuffer.toString();
}
use of gate.AnnotationSet in project gate-core by GateNLP.
the class DocumentImpl method writeEmptyTag.
// writeEmptyTag
/**
 * Returns a string representing an empty XML tag ({@code <type .../>})
 * based on the input annotation.
 *
 * <p>The tag name is the annotation type, prefixed with the namespace
 * prefix feature when {@code serializeNamespaceInfo} is set and the prefix
 * is non-empty. A {@code gateId} attribute is added unless the annotation
 * belongs to the original markups annotation set. The annotation's features
 * are appended via {@code writeFeatures}.
 *
 * @param annot the annotation to serialise; may be {@code null}
 * @param includeNamespace passed through to {@code writeFeatures}
 * @return the empty tag, or an empty string if {@code annot} is
 *         {@code null}
 */
private String writeEmptyTag(Annotation annot, boolean includeNamespace) {
  // The null guard must come before any access to annot; previously the
  // namespace lookup dereferenced annot first and threw a
  // NullPointerException before the guard could be reached.
  if (annot == null) {
    return "";
  }
  // Get the annot feature used to store the namespace prefix, if it has
  // been defined.
  String nsPrefix = null;
  if (serializeNamespaceInfo) {
    nsPrefix = (String) annot.getFeatures().get(namespacePrefixFeature);
  }
  StringBuilder strBuff = new StringBuilder("<");
  if (nsPrefix != null && !nsPrefix.isEmpty()) {
    strBuff.append(nsPrefix).append(':');
  }
  strBuff.append(annot.getType());
  AnnotationSet originalMarkupsAnnotSet =
      this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  // Annotations outside the original markups set carry their GATE id.
  if (!originalMarkupsAnnotSet.contains(annot)) {
    strBuff.append(" gateId=\"");
    strBuff.append(annot.getId());
    strBuff.append("\"");
  }
  strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
  strBuff.append("/>");
  return strBuff.toString();
}
use of gate.AnnotationSet in project gate-core by GateNLP.
the class DocumentImpl method saveAnnotationSetAsXmlInOrig.
// hasOriginalContentFeatures
/**
 * Serialises the annotations from aSourceAnnotationSet as XML tags inserted
 * into the original document content, if that content was preserved as a
 * document feature.
 *
 * <p>Offsets are processed from the largest to the smallest. For each offset
 * the tags of the annotations returned by getAnnotationsForOffset are built
 * and inserted into the original content at the position computed by the
 * document's RepositioningInfo.
 *
 * <p><b>Side effect:</b> annotations are removed from aSourceAnnotationSet
 * once their start tag (or empty tag) has been written, so the caller's set
 * is modified.
 *
 * @param aSourceAnnotationSet
 * is a GATE annotation set prepared to be used on the raw text from
 * document content. If it is <b>null</b> the original content is
 * returned unchanged (an empty string when no original content
 * feature is present).
 * @param includeFeatures
 * is a boolean, which controls whether the annotation features and
 * gate ID are included or not.
 * @return The XML document obtained from raw text + the information from the
 * source annotation set.
 */
private String saveAnnotationSetAsXmlInOrig(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
StringBuffer docContStrBuff;
String origContent;
// The original (pre-processing) content is stored as a document feature.
origContent = (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
if (origContent == null) {
origContent = "";
}
// Length of the untouched original content; used below as an upper bound
// for the computed insertion positions.
long originalContentSize = origContent.length();
// NOTE(review): repositioning is dereferenced below without a null check;
// presumably callers only reach this method when the feature is present
// (toXml guards with hasOriginalContentFeatures()) - confirm.
RepositioningInfo repositioning = (RepositioningInfo) getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
docContStrBuff = new StringBuffer(origContent);
// Nothing to dump: return the original content as is.
if (aSourceAnnotationSet == null)
return docContStrBuff.toString();
StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
AnnotationSet originalMarkupsAnnotSet = this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
// Create a dumping annotation set on the document. It will be used for
// dumping annotations...
// NOTE(review): dumpingSet is only used for the safety check and the
// warning below; the serialisation that follows iterates
// aSourceAnnotationSet, so annotations reported as "discarded" here are
// not actually filtered out of the output - confirm whether intended.
AnnotationSet dumpingSet = new AnnotationSetImpl(this);
if (sListener != null)
sListener.statusChanged("Constructing the dumping annotation set.");
// Then take all the annotations from aSourceAnnotationSet and verify if
// they can be inserted safely into the dumpingSet. Where not possible,
// report.
Iterator<Annotation> iter = aSourceAnnotationSet.iterator();
Annotation currentAnnot;
while (iter.hasNext()) {
currentAnnot = iter.next();
if (insertsSafety(originalMarkupsAnnotSet, currentAnnot) && insertsSafety(dumpingSet, currentAnnot)) {
dumpingSet.add(currentAnnot);
} else {
Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() + ", startOffset=" + currentAnnot.getStartNode().getOffset() + ", endOffset=" + currentAnnot.getEndNode().getOffset() + ", type=" + currentAnnot.getType() + " was found to violate the" + " crossed over condition. It will be discarded");
}
// End if
}
// Here we go.
if (sListener != null)
sListener.statusChanged("Dumping annotations as XML");
// /////////////////////////////////////////
// Collect every distinct start and end offset of the source annotations.
// For each offset the tags are written from left to right into a
// temporary buffer, which is then inserted into the original content.
TreeSet<Long> offsets = new TreeSet<Long>();
iter = aSourceAnnotationSet.iterator();
while (iter.hasNext()) {
Annotation annot = iter.next();
offsets.add(annot.getStartNode().getOffset());
offsets.add(annot.getEndNode().getOffset());
}
// Process the offsets in descending order (largest first); presumably so
// that an insertion never shifts a position that is still to be processed.
while (!offsets.isEmpty()) {
Long offset = offsets.last();
// Remove the offset from the set
offsets.remove(offset);
// Now, use it.
// Returns a list with annotations that needs to be serialized in that
// offset.
List<Annotation> annotations = getAnnotationsForOffset(aSourceAnnotationSet, offset);
// Attention: the annotation are serialized from left to right
StringBuffer tmpBuff = new StringBuffer("");
// Zero-length "isEmptyAndSpan" annotations whose start tag has been
// written and whose end tag is still pending.
Stack<Annotation> stack = new Stack<Annotation>();
// Iterate through all these annotations and serialize them
Iterator<Annotation> it = annotations.iterator();
Annotation a = null;
while (it.hasNext()) {
a = it.next();
// Drops the annotation from the list returned by getAnnotationsForOffset.
it.remove();
// Test if a Ends at offset
if (offset.equals(a.getEndNode().getOffset())) {
// Test if a Starts at offset
if (offset.equals(a.getStartNode().getOffset())) {
// Here, the annotation a Starts and Ends at the offset
if (null != a.getFeatures().get("isEmptyAndSpan") && "true".equals(a.getFeatures().get("isEmptyAndSpan"))) {
// Assert: annotation a with start == end and isEmptyAndSpan;
// write its start tag now and defer the end tag via the stack.
tmpBuff.append(writeStartTag(a, includeFeatures, false));
stack.push(a);
} else {
// Assert annotation a with start == end and an empty tag
// NOTE(review): writeEmptyTag receives a literal false rather than
// includeFeatures; its second parameter is named includeNamespace.
tmpBuff.append(writeEmptyTag(a, false));
// The annotation is removed from the caller's source set
aSourceAnnotationSet.remove(a);
}
// End if
} else {
// a only ends here: first close any pending zero-length spans, then
// write a's end tag.
while (!stack.isEmpty()) {
Annotation a1 = stack.pop();
tmpBuff.append(writeEndTag(a1));
}
// End while
tmpBuff.append(writeEndTag(a));
}
// End if
} else {
// a does not end at the offset; check whether it starts here.
if (offset.equals(a.getStartNode().getOffset())) {
// Close any pending zero-length spans, then write a's start tag.
while (!stack.isEmpty()) {
Annotation a1 = stack.pop();
tmpBuff.append(writeEndTag(a1));
}
// End while
tmpBuff.append(writeStartTag(a, includeFeatures, false));
// The annotation is removed from the caller's source set
aSourceAnnotationSet.remove(a);
}
// End if ( offset.equals(a.getStartNode().getOffset()) )
}
// End if ( offset.equals(a.getEndNode().getOffset()) )
}
// Close any zero-length spans still pending after the last annotation.
while (!stack.isEmpty()) {
Annotation a1 = stack.pop();
tmpBuff.append(writeEndTag(a1));
}
// End while
long originalPosition = -1;
// Only the LAST annotation visited at this offset decides whether the
// "end of annotation" variant of the position lookup is tried first.
boolean backPositioning = a != null && offset.equals(a.getEndNode().getOffset());
if (backPositioning) {
// end of the annotation correction
originalPosition = repositioning.getOriginalPos(offset.intValue(), true);
}
// Fall back to the plain lookup when no position was found above.
if (originalPosition == -1) {
originalPosition = repositioning.getOriginalPos(offset.intValue());
}
// Insert tmpBuff to the location where it belongs in docContStrBuff
if (originalPosition != -1 && originalPosition <= originalContentSize) {
docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
} else {
Out.prln("Error in the repositioning. The offset (" + offset.intValue() + ") could not be positioned in the original document. \n" + "Calculated position is: " + originalPosition + " placed back: " + backPositioning);
}
// if
}
// End while(!offsets.isEmpty())
// theRootAnnotation is a field that is not assigned within this method.
// NOTE(review): only an end tag is appended for it here - confirm that the
// matching start tag is already part of the original content.
if (theRootAnnotation != null)
docContStrBuff.append(writeEndTag(theRootAnnotation));
return docContStrBuff.toString();
}
use of gate.AnnotationSet in project gate-core by GateNLP.
the class DocumentImpl method toXml.
/**
 * Returns an XML document aiming to preserve the original markups (the
 * original markup will be in the same place and format as it was before
 * processing the document) and include (if possible) the annotations
 * specified in the aSourceAnnotationSet. <b>Warning:</b> Annotations from
 * the aSourceAnnotationSet will be lost if they would cause a crossed over
 * situation.
 *
 * <p>When {@code hasOriginalContentFeatures()} is true the work is delegated
 * to {@code saveAnnotationSetAsXmlInOrig}. Otherwise the original markups
 * and the safely insertable source annotations are sorted with
 * {@code gate.util.OffsetComparator} and serialised; an XML declaration is
 * emitted first when the "MimeType" feature equals "text/xml" (ignoring
 * case). As a side effect the {@code theRootAnnotation} field is assigned.
 *
 * @param aSourceAnnotationSet
 *          is an annotation set containing all the annotations that will be
 *          combined with the original markup set. If the param is
 *          <code>null</code> it will only dump the original markups.
 * @param includeFeatures
 *          is a boolean that controls whether the annotation features should
 *          be included or not. If false, only the annotation type is included
 *          in the tag.
 * @return a string representing an XML document containing the original
 *         markup + dumped annotations from the aSourceAnnotationSet
 * @throws GateRuntimeException if the document content cannot be read
 *           while building the crossed-over warning message; the original
 *           InvalidOffsetException is attached as the cause
 */
@Override
@SuppressWarnings("unused")
public String toXml(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
  if (hasOriginalContentFeatures()) {
    return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet, includeFeatures);
  }
  AnnotationSet originalMarkupsAnnotSet =
      this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
  // List of annotations that will be dumped, seeded with the original
  // markups.
  List<Annotation> dumpingList = new ArrayList<Annotation>(originalMarkupsAnnotSet.size());
  StatusListener sListener =
      (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
  if (sListener != null) {
    sListener.statusChanged("Constructing the dumping annotation set.");
  }
  // First add all annotations from the original markups.
  dumpingList.addAll(originalMarkupsAnnotSet);
  // Then add every source annotation that can be inserted safely; the
  // others are dropped (and reported only in DEBUG mode).
  if (aSourceAnnotationSet != null) {
    for (Annotation currentAnnot : aSourceAnnotationSet) {
      if (insertsSafety(dumpingList, currentAnnot)) {
        dumpingList.add(currentAnnot);
      } else if (crossedOverAnnotation != null && DEBUG) {
        try {
          Out.prln("Warning: Annotations were found to violate the " + "crossed over condition: \n"
              + "1. [" + getContent().getContent(crossedOverAnnotation.getStartNode().getOffset(),
                  crossedOverAnnotation.getEndNode().getOffset())
              + " (" + crossedOverAnnotation.getType() + ": "
              + crossedOverAnnotation.getStartNode().getOffset() + ";"
              + crossedOverAnnotation.getEndNode().getOffset() + ")]\n"
              + "2. [" + getContent().getContent(currentAnnot.getStartNode().getOffset(),
                  currentAnnot.getEndNode().getOffset())
              + " (" + currentAnnot.getType() + ": "
              + currentAnnot.getStartNode().getOffset() + ";"
              + currentAnnot.getEndNode().getOffset()
              + ")]\nThe second one will be discarded.\n");
        } catch (gate.util.InvalidOffsetException ex) {
          // Keep the original exception as the cause so its stack trace is
          // not lost; previously only the message was carried over.
          GateRuntimeException wrapped = new GateRuntimeException(ex.getMessage());
          wrapped.initCause(ex);
          throw wrapped;
        }
      }
    }
  }
  // kalina: order the dumping list by start offset
  Collections.sort(dumpingList, new gate.util.OffsetComparator());
  // Here we go.
  if (sListener != null) {
    sListener.statusChanged("Dumping annotations as XML");
  }
  StringBuffer xmlDoc = new StringBuffer(
      DocumentXmlUtils.DOC_SIZE_MULTIPLICATION_FACTOR * (this.getContent().size().intValue()));
  // Add xml header if original format was xml
  String mimeType = (String) getFeatures().get("MimeType");
  boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
  if (wasXML) {
    xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
    xmlDoc.append(getEncoding());
    xmlDoc.append("\" ?>");
    xmlDoc.append(Strings.getNl());
  }
  // Identify and extract the root annotation from the dumping list; its
  // start tag opens the document.
  theRootAnnotation = identifyTheRootAnnotation(dumpingList);
  if (theRootAnnotation != null) {
    dumpingList.remove(theRootAnnotation);
    xmlDoc.append(writeStartTag(theRootAnnotation, includeFeatures));
  }
  // Construct and append the rest of the document
  xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures));
  // The root annotation's end tag closes the document.
  if (theRootAnnotation != null) {
    xmlDoc.append(writeEndTag(theRootAnnotation));
  }
  if (sListener != null) {
    sListener.statusChanged("Done.");
  }
  return xmlDoc.toString();
}
use of gate.AnnotationSet in project gate-core by GateNLP.
the class AnnotationSetImpl method readObject.
/**
 * Custom deserialisation hook. Restores the name, document and relations
 * from the serialised fields and re-adds every stored annotation through
 * {@code add}, which repopulates the in-memory maps.
 *
 * <p>Two stream layouts are accepted: the current one, which stores an
 * {@code annotations} array followed by two booleans saying which indices
 * to rebuild, and an older one, which stores an {@code annotsById} map and
 * no index flags.
 *
 * @param in the stream to read from
 * @throws IOException if reading fails or neither the annotations array nor
 *           the id map is present
 * @throws ClassNotFoundException if a serialised class cannot be resolved
 */
private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
  this.longestAnnot = 0L;
  ObjectInputStream.GetField serialFields = in.readFields();
  this.name = (String) serialFields.get("name", null);
  this.doc = (DocumentImpl) serialFields.get("doc", null);
  this.annotations = (Annotation[]) serialFields.get("annotations", null);

  // Index flags are only present in the new-style stream; old-style data
  // leaves both indices switched off.
  boolean rebuildTypeIndex = false;
  boolean rebuildStartNodeIndex = false;
  if (this.annotations != null) {
    // new style serialised version
    rebuildTypeIndex = in.readBoolean();
    rebuildStartNodeIndex = in.readBoolean();
  } else {
    // old style serialised version
    @SuppressWarnings("unchecked")
    Map<Integer, Annotation> legacyById =
        (Map<Integer, Annotation>) serialFields.get("annotsById", null);
    if (legacyById == null) {
      throw new IOException(
          "Invalid serialised data: neither annotations array or map by id are present.");
    }
    this.annotations = legacyById.values().toArray(new Annotation[0]);
  }

  this.annotsById = new HashMap<Integer, Annotation>(this.annotations.length);
  // Create the empty index structures that were requested; add() fills them.
  if (rebuildTypeIndex) {
    annotsByType = new HashMap<String, AnnotationSet>(Gate.HASH_STH_SIZE);
  }
  if (rebuildStartNodeIndex) {
    nodesByOffset = new RBTreeMap<Long, Node>();
    annotsByStartNode = new HashMap<Integer, Object>(this.annotations.length);
  }
  // Re-insert all the annotations one by one.
  for (Annotation restored : this.annotations) {
    add(restored);
  }
  this.relations = (RelationSet) serialFields.get("relations", null);
  // The array was only needed to carry the annotations through the stream.
  this.annotations = null;
}
Aggregations