use of gate.Annotation in project gate-core by GateNLP.
the class DocumentStaxUtils method writeXcesAnnotations.
/**
 * Serializes a collection of annotations as an XCES <code>cesAna</code>
 * element on the supplied XMLStreamWriter.
 * <p>
 * Only the <code>cesAna</code> element is produced: the XML declaration,
 * if wanted, must be written by the caller, and the writer is
 * <i>not</i> closed here. Each annotation becomes one
 * <code>struct</code> element carrying <code>type</code>,
 * <code>from</code> and <code>to</code> attributes; each feature other
 * than <code>isEmptyAndSpan</code> becomes a nested <code>feat</code>
 * element. Characters in feature <i>values</i> that are illegal in XML
 * are replaced by {@link #INVALID_CHARACTER_REPLACEMENT} (a space).
 * Feature <i>names</i> and annotation types are written unmodified, so
 * an illegal character in one of those makes serialization fail.
 *
 * @param annotations the annotations to save, typically an
 *          AnnotationSet. The collection itself is not modified; a
 *          sorted copy is used.
 * @param xsw the XMLStreamWriter to write to
 * @param includeId whether to emit each annotation's ID as the "n"
 *          attribute of its <code>struct</code>
 * @throws XMLStreamException if the underlying writer reports an error
 */
public static void writeXcesAnnotations(Collection<Annotation> annotations, XMLStreamWriter xsw, boolean includeId) throws XMLStreamException {
  // work on a private, sorted copy so the caller's collection is untouched
  List<Annotation> sortedAnnots = new ArrayList<Annotation>(annotations);
  Collections.sort(sortedAnnots, LONGEST_FIRST_OFFSET_COMPARATOR);

  // root element, with the XCES namespace declared as the default namespace
  xsw.setDefaultNamespace(XCES_NAMESPACE);
  xsw.writeStartElement(XCES_NAMESPACE, "cesAna");
  xsw.writeDefaultNamespace(XCES_NAMESPACE);
  xsw.writeAttribute("version", XCES_VERSION);
  newLine(xsw);

  final String structIndent = " ";
  final String featIndent = structIndent + structIndent;

  for (Annotation annot : sortedAnnots) {
    // read the offsets up front so nothing is written for an annotation
    // whose nodes cannot be read
    String from = Long.toString(annot.getStartNode().getOffset().longValue());
    String to = Long.toString(annot.getEndNode().getOffset().longValue());
    FeatureMap features = annot.getFeatures();
    boolean hasFeatures = features != null && features.size() != 0;

    xsw.writeCharacters(structIndent);
    if (hasFeatures) {
      // features follow as children, so the element must stay open
      xsw.writeStartElement(XCES_NAMESPACE, "struct");
    } else {
      xsw.writeEmptyElement(XCES_NAMESPACE, "struct");
    }
    xsw.writeAttribute("type", annot.getType());
    xsw.writeAttribute("from", from);
    xsw.writeAttribute("to", to);
    if (includeId) {
      // the annotation ID goes into the "n" attribute
      xsw.writeAttribute("n", String.valueOf(annot.getId()));
    }
    newLine(xsw);

    if (!hasFeatures) {
      // self-closing struct: nothing more to write for this annotation
      continue;
    }

    for (Map.Entry<Object, Object> feature : features.entrySet()) {
      if ("isEmptyAndSpan".equals(feature.getKey())) {
        // this feature is deliberately left out of the output
        continue;
      }
      xsw.writeCharacters(featIndent);
      xsw.writeEmptyElement(XCES_NAMESPACE, "feat");
      xsw.writeAttribute("name", String.valueOf(feature.getKey()));
      xsw.writeAttribute("value", replaceXMLIllegalCharactersInString(String.valueOf(feature.getValue())));
      newLine(xsw);
    }

    // close the struct opened above
    xsw.writeCharacters(structIndent);
    xsw.writeEndElement();
    newLine(xsw);
  }

  // close cesAna
  xsw.writeEndElement();
  newLine(xsw);
}
use of gate.Annotation in project gate-core by GateNLP.
the class DocumentStaxUtils method writeTextWithNodes.
/**
 * Writes the content of the given document to an XMLStreamWriter as a
 * mixed content element called "TextWithNodes". At each offset where
 * an annotation in any of the supplied collections starts or ends, an
 * empty "Node" element is written whose "id" attribute is that offset.
 * <p>
 * If the document has no content (its content object is null, or the
 * content's string form is null) a single empty "TextWithNodes"
 * element is written and the method returns.
 *
 * @param doc the document whose content is to be written
 * @param annotationSets the annotations for which nodes are required.
 *          This is a collection of collections; it may be null, and
 *          null member collections are skipped.
 * @param xsw the {@link XMLStreamWriter} to write to.
 * @param namespaceURI the namespace URI. May be empty but may not be
 *          null.
 * @throws XMLStreamException if the underlying writer reports an error
 */
public static void writeTextWithNodes(Document doc, Collection<Collection<Annotation>> annotationSets, XMLStreamWriter xsw, String namespaceURI) throws XMLStreamException {
  // Check the content object itself before calling toString() on it:
  // dereferencing a null content would throw before the "no text" branch
  // below could ever be taken.
  Object content = doc.getContent();
  String aText = (content == null) ? null : content.toString();
  // no text, so return an empty element
  if (aText == null) {
    xsw.writeEmptyElement(namespaceURI, "TextWithNodes");
    return;
  }

  // build a set of all the offsets where Nodes are required; a TreeSet
  // both de-duplicates the offsets and iterates them in ascending order
  TreeSet<Long> offsetsSet = new TreeSet<Long>();
  if (annotationSets != null) {
    for (Collection<Annotation> set : annotationSets) {
      if (set != null) {
        for (Annotation annot : set) {
          offsetsSet.add(annot.getStartNode().getOffset());
          offsetsSet.add(annot.getEndNode().getOffset());
        }
      }
    }
  }

  // write the TextWithNodes element
  char[] textArray = aText.toCharArray();
  xsw.writeStartElement(namespaceURI, "TextWithNodes");
  int lastNodeOffset = 0;
  for (Long nodeOffset : offsetsSet) {
    // offsets are used as indices into the char array, hence int
    int offset = nodeOffset.intValue();
    // write characters since the last node output.
    // XML-illegal characters are replaced in this slice of text only - we
    // have to do this here rather than on the text as a whole in case
    // the node falls between the two halves of a surrogate pair (in
    // which case both halves are illegal and must be replaced).
    replaceXMLIllegalCharacters(textArray, lastNodeOffset, offset - lastNodeOffset);
    writeCharactersOrCDATA(xsw, new String(textArray, lastNodeOffset, offset - lastNodeOffset));
    xsw.writeEmptyElement(namespaceURI, "Node");
    xsw.writeAttribute("id", String.valueOf(offset));
    lastNodeOffset = offset;
  }

  // write any remaining text after the last node
  replaceXMLIllegalCharacters(textArray, lastNodeOffset, textArray.length - lastNodeOffset);
  writeCharactersOrCDATA(xsw, new String(textArray, lastNodeOffset, textArray.length - lastNodeOffset));
  // and the closing TextWithNodes
  xsw.writeEndElement();
}
use of gate.Annotation in project gate-core by GateNLP.
the class EmailDocumentFormat method unpackMarkup.
/**
 * Unpacks the markup in the document, converting it from the native
 * format (here: e-mail) into GATE annotations.
 * <p>
 * The handler is given this format's markupElementsMap and
 * element2StringMap. The document's content is always parsed,
 * whether or not its source URL is null. After the messages have been
 * annotated, every "body" annotation in the original-markups
 * annotation set is additionally scanned for paragraphs.
 *
 * @param doc The GATE document you want to parse.
 * @throws DocumentFormatException if the document is null, if it has
 *           neither a source URL nor content, or if annotating it
 *           fails with an IOException or InvalidOffsetException
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
  boolean nothingToParse = doc == null || (doc.getSourceUrl() == null && doc.getContent() == null);
  if (nothingToParse) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }

  setNewLineProperty(doc);

  // the handler does the actual work of annotating the e-mail structure
  final EmailDocumentHandler handler = new gate.email.EmailDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);

  // relay the handler's status messages through this format's own
  // fireStatusChanged (inherited from DocumentFormat)
  final StatusListener relay = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      fireStatusChanged(text);
    }
  };
  handler.addStatusListener(relay);

  try {
    // create the message annotations on the GATE document
    handler.annotateMessages();

    // then look for paragraphs inside each message body
    AnnotationSet bodies = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
    if (bodies != null && !bodies.isEmpty()) {
      for (Annotation body : bodies) {
        annotateParagraphs(doc, body.getStartNode().getOffset().intValue(), body.getEndNode().getOffset().intValue(), GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
      }
    }
  } catch (IOException e) {
    throw new DocumentFormatException("Couldn't create a buffered reader ", e);
  } catch (InvalidOffsetException e) {
    throw new DocumentFormatException(e);
  } finally {
    // always detach the relay, even when annotation failed
    handler.removeStatusListener(relay);
  }
}
use of gate.Annotation in project gate-core by GateNLP.
the class UimaDocumentFormat method unpackCasMarkup.
/**
 * Convert UIMA CAS markups to GATE markups.
 * <p>
 * Works on the "Original markups" annotation set of a document whose
 * XML has already been parsed. The format is detected from the
 * presence of a "CAS" annotation (XCAS) or an "xmi:XMI" annotation
 * (XMI CAS). The document content is replaced by the concatenated
 * "sofaString" features of the Sofa annotations; structural
 * annotations (Sofa, CAS, xmi:XMI, cas:NULL, View) are removed; and
 * every remaining UIMA annotation is either re-created as a GATE
 * annotation at its "begin"/"end" offsets or, when it has no offsets,
 * stored as document features. Members of each UIMA view are also
 * added to an annotation set named "CasView" + view index.
 *
 * @param doc XML document already parsed
 * @throws DocumentFormatException if the document is neither XCAS nor
 *           XMI CAS, or if an offset is invalid while reading content
 *           or creating an annotation
 */
private void unpackCasMarkup(Document doc) throws DocumentFormatException {
  // input and output are both requested under the same set name
  AnnotationSet inputAS = doc.getAnnotations("Original markups");
  AnnotationSet outputAS = doc.getAnnotations("Original markups");

  // set format specific names
  String casPrefix;
  String idName;
  if (!inputAS.get("CAS").isEmpty()) {
    casPrefix = "uima.cas.";
    idName = "_id";
  } else if (!inputAS.get("xmi:XMI").isEmpty()) {
    casPrefix = "cas:";
    idName = "xmi:id";
  } else {
    throw new DocumentFormatException("The document \"" + doc.getName() + "\" is neither of XCAS nor XMICAS format.");
  }

  // Matches the array/list element types of the detected format. The
  // prefix is quoted because it contains regex metacharacters (the dots
  // in "uima.cas." would otherwise match any character), and the pattern
  // is compiled once rather than once per annotation.
  java.util.regex.Pattern collectionTypePattern = java.util.regex.Pattern.compile(java.util.regex.Pattern.quote(casPrefix) + "[a-zA-Z]+(List|Array)");

  // get array/list contained elements annotations
  for (Annotation annotation : inputAS) {
    if (collectionTypePattern.matcher(annotation.getType()).matches()) {
      try {
        String elements = doc.getContent().getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()).toString();
        // add contained values as a feature to the array annotation
        if (!elements.trim().equals("")) {
          annotation.getFeatures().put("elements", elements);
        }
      } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
      }
    }
  }

  // get document content from SOFA annotations
  Set<Annotation> sofaSet = inputAS.get(casPrefix + "Sofa");
  if (sofaSet.size() > 1) {
    Out.prln("More than one UIMA SOFA, annotation offsets won't be correct.");
  }
  StringBuilder documentContent = new StringBuilder();
  for (Annotation sofa : sofaSet) {
    String sofaString = (String) sofa.getFeatures().get("sofaString");
    // a Sofa without a sofaString contributes no text; appending the null
    // reference would put the literal characters "null" into the document
    if (sofaString != null) {
      documentContent.append(sofaString);
    }
  }
  doc.setContent(new DocumentContentImpl(documentContent.toString()));

  // remove SOFA annotations
  inputAS.removeAll(sofaSet);
  // remove non document annotations
  inputAS.removeAll(inputAS.get("CAS"));
  inputAS.removeAll(inputAS.get("xmi:XMI"));
  inputAS.removeAll(inputAS.get("cas:NULL"));

  // get the views members, views will be added later as annotation sets
  List<List<String>> viewList = new ArrayList<List<String>>();
  for (Annotation view : inputAS.get(casPrefix + "View")) {
    String members = (String) view.getFeatures().get("members");
    if (members == null) {
      // a view without a members feature has no members; still record it
      // so that the indices used for the "CasView" set names stay aligned
      viewList.add(new ArrayList<String>());
    } else {
      viewList.add(Arrays.asList(members.split("\\s+")));
    }
  }
  inputAS.removeAll(inputAS.get(casPrefix + "View"));

  // fill a map with the id as key and the entity name as value
  // this is specific to the Temis Luxid CAS format
  Map<String, String> entityMap = new HashMap<String, String>();
  for (Annotation entity : inputAS.get("com.temis.uima.Entity")) {
    FeatureMap features = entity.getFeatures();
    entityMap.put((String) features.get(idName), (String) features.get("value"));
  }

  try {
    // for each UIMA annotation; iterate over a snapshot because the loop
    // both adds to and removes from the underlying annotation set
    for (Annotation annotation : new HashSet<Annotation>(inputAS)) {
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annotation.getFeatures());
      String start = (String) features.get("begin");
      String end = (String) features.get("end");
      String id = (String) features.get(idName);
      // UIMA feature
      features.remove("begin");
      // UIMA feature
      features.remove("end");
      // GATE feature
      features.remove("isEmptyAndSpan");
      // UIMA XCAS feature
      features.remove("_indexed");
      if (start == null || end == null) {
        // no offsets so add it as a GATE document feature
        features.remove(idName);
        for (Map.Entry<Object, Object> entry : features.entrySet()) {
          doc.getFeatures().put(annotation.getType() + '_' + id + '.' + entry.getKey(), entry.getValue());
        }
      } else {
        // offsets so add it as a GATE document annotation; when the
        // annotation references a known entity, the entity's value is used
        // as the annotation type instead of the UIMA type
        String entityReference = (String) features.get("_ref_entity");
        String type = entityMap.containsKey(entityReference) ? entityMap.get(entityReference) : annotation.getType();
        Integer gateId = outputAS.add(Long.valueOf(start), Long.valueOf(end), type, features);
        int viewCount = 0;
        for (List<String> viewMembers : viewList) {
          if (viewMembers.contains(id)) {
            // add the annotation to the annotation set
            doc.getAnnotations("CasView" + viewCount).add(outputAS.get(gateId));
          }
          viewCount++;
        }
      }
      // delete UIMA annotation
      inputAS.remove(annotation);
    }
  } catch (InvalidOffsetException e) {
    throw new DocumentFormatException("Couldn't create annotation.", e);
  }
}
Aggregations