use of gate.AnnotationSet in project gate-core by GateNLP.
the class EmailDocumentFormat method unpackMarkup.
/**
 * Unpacks the markup of the document, turning markup in the native format
 * (e.g. EMAIL) into annotations in GATE format.
 * <p>
 * The {@code markupElementsMap} determines which elements are converted and
 * which annotation type names are used. The document content is always
 * parsed, whether or not the source URL is null.
 * <p>
 * Once the handler has annotated the messages, every "body" annotation found
 * in the original markups annotation set is passed to
 * {@code annotateParagraphs} together with its start and end offsets.
 *
 * @param doc The GATE document you want to parse.
 * @throws DocumentFormatException if {@code doc} is null, if both its source
 *           URL and its content are null, or if annotating the document
 *           raises an {@code IOException} or {@code InvalidOffsetException}.
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
  boolean nothingToParse = doc == null
      || (doc.getSourceUrl() == null && doc.getContent() == null);
  if (nothingToParse) {
    throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
  }
  setNewLineProperty(doc);

  // The handler that creates the annotations on the GATE document.
  final EmailDocumentHandler handler =
      new gate.email.EmailDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);

  // Relay the handler's status messages through this format's own
  // fireStatusChanged (implemented in DocumentFormat.java and inherited here).
  final StatusListener statusRelay = new StatusListener() {
    @Override
    public void statusChanged(String text) {
      fireStatusChanged(text);
    }
  };
  handler.addStatusListener(statusRelay);

  try {
    handler.annotateMessages();

    // Look for paragraphs inside every body annotation.
    AnnotationSet bodies =
        doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
    if (bodies != null && !bodies.isEmpty()) {
      for (Annotation body : bodies) {
        int bodyStart = body.getStartNode().getOffset().intValue();
        int bodyEnd = body.getEndNode().getOffset().intValue();
        annotateParagraphs(doc, bodyStart, bodyEnd, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
      }
    }
  } catch (IOException e) {
    throw new DocumentFormatException("Couldn't create a buffered reader ", e);
  } catch (InvalidOffsetException e) {
    throw new DocumentFormatException(e);
  } finally {
    // Always detach the relay, whether or not annotation succeeded.
    handler.removeStatusListener(statusRelay);
  }
}
use of gate.AnnotationSet in project gate-core by GateNLP.
the class UimaDocumentFormat method unpackCasMarkup.
/**
 * Convert UIMA CAS markups to GATE markups.
 * <p>
 * The "Original markups" annotation set of the document is used both as the
 * input and as the output set. The method:
 * <ol>
 * <li>detects the format (XCAS if a "CAS" annotation exists, XMI CAS if an
 * "xmi:XMI" annotation exists) and picks the matching type prefix and id
 * feature name;</li>
 * <li>stores the text covered by each List/Array annotation in its
 * "elements" feature;</li>
 * <li>replaces the document content with the concatenated "sofaString"
 * features of the Sofa annotations;</li>
 * <li>removes the Sofa, CAS, xmi:XMI, cas:NULL and View annotations;</li>
 * <li>re-creates every remaining annotation that has "begin" and "end"
 * features as a GATE annotation at those offsets (also adding it to a
 * "CasView<i>n</i>" set for each view listing its id), and copies the
 * features of annotations without offsets to the document features.</li>
 * </ol>
 *
 * @param doc XML document already parsed
 * @throws DocumentFormatException error when parsing the file: the document
 *           is neither XCAS nor XMI CAS, an offset is invalid, or a
 *           "begin"/"end" feature is not a number
 */
private void unpackCasMarkup(Document doc) throws DocumentFormatException {
  AnnotationSet inputAS = doc.getAnnotations("Original markups");
  AnnotationSet outputAS = doc.getAnnotations("Original markups");

  // set format specific names
  String casPrefix;
  String idName;
  if (!inputAS.get("CAS").isEmpty()) {
    casPrefix = "uima.cas.";
    idName = "_id";
  } else if (!inputAS.get("xmi:XMI").isEmpty()) {
    casPrefix = "cas:";
    idName = "xmi:id";
  } else {
    throw new DocumentFormatException("The document \"" + doc.getName() + "\" is neither of XCAS nor XMICAS format.");
  }

  // get array/list contained elements annotations
  // The prefix is quoted so that the dots of "uima.cas." only match literal
  // dots, and the pattern is compiled once instead of once per annotation.
  java.util.regex.Pattern arrayTypePattern = java.util.regex.Pattern.compile(
      java.util.regex.Pattern.quote(casPrefix) + "[a-zA-Z]+(List|Array)");
  for (Annotation annotation : inputAS) {
    if (arrayTypePattern.matcher(annotation.getType()).matches()) {
      try {
        String elements = doc.getContent().getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()).toString();
        // add contained values as a feature to the array annotation
        if (!elements.trim().equals("")) {
          annotation.getFeatures().put("elements", elements);
        }
      } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
      }
    }
  }

  // get document content from SOFA annotations
  Set<Annotation> sofaSet = inputAS.get(casPrefix + "Sofa");
  if (sofaSet.size() > 1) {
    Out.prln("More than one UIMA SOFA, annotation offsets won't be correct.");
  }
  StringBuilder documentContent = new StringBuilder();
  for (Annotation annotation : sofaSet) {
    String sofaString = (String) annotation.getFeatures().get("sofaString");
    // StringBuilder.append(null) would add the literal text "null" to the
    // document, so a Sofa without a sofaString contributes nothing.
    if (sofaString != null) {
      documentContent.append(sofaString);
    }
  }
  doc.setContent(new DocumentContentImpl(documentContent.toString()));

  // remove SOFA annotations
  inputAS.removeAll(sofaSet);
  // remove non document annotations
  inputAS.removeAll(inputAS.get("CAS"));
  inputAS.removeAll(inputAS.get("xmi:XMI"));
  inputAS.removeAll(inputAS.get("cas:NULL"));

  // get the views members, views will be added later as annotation sets
  List<List<String>> viewList = new ArrayList<List<String>>();
  for (Annotation view : inputAS.get(casPrefix + "View")) {
    String members = (String) view.getFeatures().get("members");
    if (members == null) {
      // A view without members still takes a slot in the list, so the
      // index-based "CasView<n>" set names below count every view.
      viewList.add(new ArrayList<String>());
    } else {
      viewList.add(Arrays.asList(members.split("\\s+")));
    }
  }
  inputAS.removeAll(inputAS.get(casPrefix + "View"));

  // fill a map with the id as key and the entity name as value
  // this is specific to the Temis Luxid CAS format
  Map<String, String> entityMap = new HashMap<String, String>();
  for (Annotation entity : inputAS.get("com.temis.uima.Entity")) {
    FeatureMap features = entity.getFeatures();
    entityMap.put((String) features.get(idName), (String) features.get("value"));
  }

  try {
    // for each UIMA annotation; iterate over a copy because the loop
    // removes from and adds to the underlying set
    for (Annotation annotation : new HashSet<Annotation>(inputAS)) {
      FeatureMap features = Factory.newFeatureMap();
      features.putAll(annotation.getFeatures());
      String start = (String) features.get("begin");
      String end = (String) features.get("end");
      String id = (String) features.get(idName);
      // UIMA feature
      features.remove("begin");
      // UIMA feature
      features.remove("end");
      // GATE feature
      features.remove("isEmptyAndSpan");
      // UIMA XCAS feature
      features.remove("_indexed");
      if (start == null || end == null) {
        // no offsets so add it as a GATE document feature
        features.remove(idName);
        for (Map.Entry<Object, Object> entry : features.entrySet()) {
          doc.getFeatures().put(annotation.getType() + '_' + id + '.' + entry.getKey(), entry.getValue());
        }
      } else {
        // offsets so add it as a GATE document annotation
        String entityReference = (String) features.get("_ref_entity");
        String type = entityMap.containsKey(entityReference) ? entityMap.get(entityReference) : annotation.getType();
        Integer gateId = outputAS.add(Long.valueOf(start), Long.valueOf(end), type, features);
        int viewCount = 0;
        for (List<String> viewMembers : viewList) {
          if (viewMembers.contains(id)) {
            // add the annotation to the annotation set
            doc.getAnnotations("CasView" + viewCount).add(outputAS.get(gateId));
          }
          viewCount++;
        }
      }
      // delete UIMA annotation
      inputAS.remove(annotation);
    }
  } catch (InvalidOffsetException e) {
    throw new DocumentFormatException("Couldn't create annotation.", e);
  } catch (NumberFormatException e) {
    // a non-numeric "begin"/"end" is a malformed document, so report it
    // through the declared exception instead of an unchecked one
    throw new DocumentFormatException("Couldn't create annotation.", e);
  }
}
use of gate.AnnotationSet in project gate-core by GateNLP.
the class InlineXMLExporter method export.
/**
 * Writes the document to the stream as inline XML.
 * <p>
 * The following entries are read from {@code options}:
 * "annotationSetName" (set to export from), "annotationTypes" (list of
 * annotation types to export), "includeOriginalMarkups", "rootElement"
 * (optional type of a root annotation spanning the whole content),
 * "encoding" (charset name for the output) and "includeFeatures".
 * <p>
 * When "includeOriginalMarkups" is false the original markups annotation set
 * is emptied for the duration of the export; its saved annotations are added
 * back in the {@code finally} block. The writer is flushed but not closed.
 *
 * @param doc the document to export
 * @param out the stream the XML is written to
 * @param options the export settings listed above
 * @throws IOException if writing fails, or wrapping an
 *           {@code InvalidOffsetException} raised while adding the root
 *           annotation
 */
@Override
public void export(Document doc, OutputStream out, FeatureMap options) throws IOException {
  Integer fakeRootId = null;
  AnnotationSet exportSet = null;
  AnnotationSet liveOriginalMarkups = null;
  AnnotationSet savedOriginalMarkups = null;
  try {
    String sourceSetName = (String) options.get("annotationSetName");
    AnnotationSet sourceSet = doc.getAnnotations(sourceSetName);

    Boolean includeOriginalMarkups = (Boolean) options.get("includeOriginalMarkups");
    if (!includeOriginalMarkups) {
      // hide the original markups while exporting, keeping a copy to put back
      liveOriginalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
      savedOriginalMarkups = new AnnotationSetImpl(liveOriginalMarkups);
      liveOriginalMarkups.clear();
    }

    // the requested types arrive as a list; the lookup needs a set
    @SuppressWarnings("unchecked")
    List<String> requestedTypes = (List<String>) options.get("annotationTypes");
    Set<String> typeNames = new HashSet<String>(requestedTypes);

    // collect the annotations to export into a scratch set
    AnnotationSet selected = sourceSet.get(typeNames);
    exportSet = new AnnotationSetImpl(doc);
    exportSet.addAll(selected);

    String rootType = (String) options.get("rootElement");
    if (rootType != null && !rootType.isEmpty()) {
      // wrap everything in a root element covering the whole content
      fakeRootId = exportSet.add(0L, doc.getContent().size(), rootType, Factory.newFeatureMap());
    }

    // write the document using the specified encoding
    String encoding = (String) options.get("encoding");
    OutputStreamWriter xmlWriter = new OutputStreamWriter(out, encoding);
    Boolean includeFeatures = (Boolean) options.get("includeFeatures");
    String xml = doc.toXml(exportSet, includeFeatures);
    xmlWriter.write(xml);
    // make sure it gets written
    xmlWriter.flush();
  } catch (InvalidOffsetException e) {
    throw new IOException(e);
  } finally {
    // delete the fake root element
    if (fakeRootId != null) {
      exportSet.remove(exportSet.get(fakeRootId));
    }
    // restore the original markups
    if (savedOriginalMarkups != null) {
      liveOriginalMarkups.addAll(savedOriginalMarkups);
    }
  }
}
Aggregations