Search in sources :

Example 41 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class EmailDocumentFormat method unpackMarkup.

/**
 * Unpacks the markup of an e-mail document into GATE annotations.
 * <p>
 * An {@code EmailDocumentHandler} is created for the document (it receives
 * this format's {@code markupElementsMap} and {@code element2StringMap}) and
 * asked to annotate the messages. Afterwards every "body" annotation found
 * in the original-markups annotation set is handed to
 * {@code annotateParagraphs} together with its start and end offsets.
 * <p>
 * The content is always processed; a {@code null} source URL is accepted as
 * long as the document has content.
 *
 * @param doc the GATE document to parse.
 * @throws DocumentFormatException if {@code doc} is {@code null}, if it has
 *         neither a source URL nor content, or if an {@code IOException} or
 *         {@code InvalidOffsetException} occurs while annotating.
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    boolean nothingToParse = doc == null || (doc.getSourceUrl() == null && doc.getContent() == null);
    if (nothingToParse) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    setNewLineProperty(doc);

    final EmailDocumentHandler handler = new gate.email.EmailDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);
    // Relay the handler's status messages through this format's own
    // fireStatusChanged (inherited from DocumentFormat per the original note).
    final StatusListener relay = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            fireStatusChanged(text);
        }
    };
    handler.addStatusListener(relay);
    try {
        // Let the handler create the annotations on the GATE document.
        handler.annotateMessages();

        // Visit each message body so that paragraphs can be annotated in it.
        final String markupSetName = GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME;
        AnnotationSet bodies = doc.getAnnotations(markupSetName).get("body");
        if (bodies != null && !bodies.isEmpty()) {
            for (Annotation body : bodies) {
                int bodyStart = body.getStartNode().getOffset().intValue();
                int bodyEnd = body.getEndNode().getOffset().intValue();
                annotateParagraphs(doc, bodyStart, bodyEnd, markupSetName);
            }
        }
    } catch (IOException e) {
        throw new DocumentFormatException("Couldn't create a buffered reader ", e);
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
    } finally {
        // The listener is only needed while this call is running.
        handler.removeStatusListener(relay);
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) EmailDocumentHandler(gate.email.EmailDocumentHandler) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) StatusListener(gate.event.StatusListener) IOException(java.io.IOException) Annotation(gate.Annotation)

Example 42 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class UimaDocumentFormat method unpackCasMarkup.

/**
 * Convert UIMA CAS markups to GATE markups.
 * <p>
 * Works on the "Original markups" annotation set of the document: the
 * document content is replaced by the concatenated {@code sofaString}
 * features of the Sofa annotations, and every remaining UIMA annotation is
 * either re-created as a GATE annotation (when it has {@code begin} and
 * {@code end} features) or copied into the document features (when it has
 * no offsets). View membership is reflected by adding the new annotations
 * to annotation sets named {@code CasView0}, {@code CasView1}, ...
 *
 * @param doc XML document already parsed
 * @throws DocumentFormatException if the document contains neither a
 *         "CAS" nor an "xmi:XMI" annotation, or if an offset is rejected
 *         while reading content or creating an annotation
 */
private void unpackCasMarkup(Document doc) throws DocumentFormatException {
    // Input and output are the same named set; two references are kept so the
    // reading and writing roles stay distinguishable below.
    AnnotationSet inputAS = doc.getAnnotations("Original markups");
    AnnotationSet outputAS = doc.getAnnotations("Original markups");
    // set format specific names
    String casPrefix;
    String idName;
    if (!inputAS.get("CAS").isEmpty()) {
        casPrefix = "uima.cas.";
        idName = "_id";
    } else if (!inputAS.get("xmi:XMI").isEmpty()) {
        casPrefix = "cas:";
        idName = "xmi:id";
    } else {
        throw new DocumentFormatException("The document \"" + doc.getName() + "\" is neither of XCAS nor XMICAS format.");
    }
    // The prefix is literal text, so it is quoted: otherwise the dots in
    // "uima.cas." would match any character. The pattern is compiled once
    // instead of once per annotation.
    java.util.regex.Pattern containerType = java.util.regex.Pattern.compile(java.util.regex.Pattern.quote(casPrefix) + "[a-zA-Z]+(List|Array)");
    // get array/list contained elements annotations
    for (Annotation annotation : inputAS) {
        if (containerType.matcher(annotation.getType()).matches()) {
            try {
                String elements = doc.getContent().getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()).toString();
                // add contained values as a feature to the array annotation
                if (!elements.trim().equals("")) {
                    annotation.getFeatures().put("elements", elements);
                }
            } catch (InvalidOffsetException e) {
                throw new DocumentFormatException(e);
            }
        }
    }
    // get document content from SOFA annotations
    Set<Annotation> sofaSet = inputAS.get(casPrefix + "Sofa");
    if (sofaSet.size() > 1) {
        Out.prln("More than one UIMA SOFA, annotation offsets won't be correct.");
    }
    StringBuilder documentContent = new StringBuilder();
    for (Annotation annotation : sofaSet) {
        documentContent.append((String) annotation.getFeatures().get("sofaString"));
    }
    doc.setContent(new DocumentContentImpl(documentContent.toString()));
    // remove SOFA annotations
    inputAS.removeAll(sofaSet);
    // remove non document annotations
    inputAS.removeAll(inputAS.get("CAS"));
    inputAS.removeAll(inputAS.get("xmi:XMI"));
    inputAS.removeAll(inputAS.get("cas:NULL"));
    // get the views members, views will be added later as annotation sets;
    // the position of a view in this list becomes its "CasView<n>" index
    List<List<String>> viewList = new ArrayList<List<String>>();
    for (Annotation view : inputAS.get(casPrefix + "View")) {
        viewList.add(Arrays.asList(((String) view.getFeatures().get("members")).split("\\s+")));
    }
    inputAS.removeAll(inputAS.get(casPrefix + "View"));
    // fill a map with the id as key and the entity name as value
    // this is specific to the Temis Luxid CAS format
    Map<String, String> entityMap = new HashMap<String, String>();
    for (Annotation entity : inputAS.get("com.temis.uima.Entity")) {
        FeatureMap features = entity.getFeatures();
        entityMap.put((String) features.get(idName), (String) features.get("value"));
    }
    try {
        // for each UIMA annotation; iterate over a copy because the set is
        // modified (annotations added and removed) inside the loop
        for (Annotation annotation : new HashSet<Annotation>(inputAS)) {
            FeatureMap features = Factory.newFeatureMap();
            features.putAll(annotation.getFeatures());
            String start = (String) features.get("begin");
            String end = (String) features.get("end");
            String id = (String) features.get(idName);
            // UIMA feature
            features.remove("begin");
            // UIMA feature
            features.remove("end");
            // GATE feature
            features.remove("isEmptyAndSpan");
            // UIMA XCAS feature
            features.remove("_indexed");
            if (start == null || end == null) {
                // no offsets so add it as a GATE document feature,
                // keyed as <type>_<id>.<feature name>
                features.remove(idName);
                for (Map.Entry<Object, Object> entry : features.entrySet()) {
                    doc.getFeatures().put(annotation.getType() + '_' + id + '.' + entry.getKey(), entry.getValue());
                }
            } else {
                // offsets so add it as a GATE document annotation; when the
                // annotation references a known entity, the entity's value
                // is used as the annotation type
                String entityReference = (String) features.get("_ref_entity");
                String type = entityMap.containsKey(entityReference) ? entityMap.get(entityReference) : annotation.getType();
                Integer gateId = outputAS.add(Long.valueOf(start), Long.valueOf(end), type, features);
                int viewCount = 0;
                for (List<String> viewMembers : viewList) {
                    if (viewMembers.contains(id)) {
                        // add the annotation to the annotation set
                        doc.getAnnotations("CasView" + viewCount).add(outputAS.get(gateId));
                    }
                    viewCount++;
                }
            }
            // delete UIMA annotation
            inputAS.remove(annotation);
        }
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException("Couldn't create annotation.", e);
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) DocumentFormatException(gate.util.DocumentFormatException) FeatureMap(gate.FeatureMap) List(java.util.List) Map(java.util.Map) HashSet(java.util.HashSet)

Example 43 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class InlineXMLExporter method export.

/**
 * Writes the document to {@code out} as XML produced by
 * {@code doc.toXml} for a selected set of annotations.
 * <p>
 * Options read from {@code options}: {@code annotationSetName},
 * {@code includeOriginalMarkups}, {@code annotationTypes},
 * {@code rootElement}, {@code encoding} and {@code includeFeatures}.
 * When original markups are not to be included, the document's
 * original-markups set is emptied for the duration of the export and
 * restored afterwards. The stream is flushed but not closed.
 *
 * @param doc the document to export
 * @param out the stream the XML is written to
 * @param options export settings, see above
 * @throws IOException if writing fails, or wrapping an
 *         {@code InvalidOffsetException} raised while adding the root
 *         annotation
 */
@Override
public void export(Document doc, OutputStream out, FeatureMap options) throws IOException {
    Integer fakeRootId = null;
    AnnotationSet exportSet = null;
    AnnotationSet markups = null;
    AnnotationSet markupsBackup = null;
    try {
        String sourceSetName = (String) options.get("annotationSetName");
        AnnotationSet sourceSet = doc.getAnnotations(sourceSetName);

        boolean keepOriginalMarkups = (Boolean) options.get("includeOriginalMarkups");
        if (!keepOriginalMarkups) {
            // Temporarily empty the original markups; they are put back in
            // the finally block.
            markups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
            markupsBackup = new AnnotationSetImpl(markups);
            markups.clear();
        }

        // The requested types arrive as a list; a set is what get(...) takes.
        @SuppressWarnings("unchecked")
        List<String> requestedTypes = (List<String>) options.get("annotationTypes");
        Set<String> typeFilter = new HashSet<String>(requestedTypes);
        AnnotationSet selected = sourceSet.get(typeFilter);

        // Work on a separate set so the temporary root never touches the
        // document's own annotation sets.
        exportSet = new AnnotationSetImpl(doc);
        exportSet.addAll(selected);

        String rootName = (String) options.get("rootElement");
        if (rootName != null && !rootName.isEmpty()) {
            // Span the whole content with an annotation acting as root element.
            fakeRootId = exportSet.add(0L, doc.getContent().size(), rootName, Factory.newFeatureMap());
        }

        String encoding = (String) options.get("encoding");
        OutputStreamWriter writer = new OutputStreamWriter(out, encoding);
        boolean withFeatures = (Boolean) options.get("includeFeatures");
        String xml = doc.toXml(exportSet, withFeatures);
        writer.write(xml);
        // Push everything through to the underlying stream.
        writer.flush();
    } catch (InvalidOffsetException e) {
        throw new IOException(e);
    } finally {
        // Drop the temporary root annotation again.
        if (fakeRootId != null) {
            exportSet.remove(exportSet.get(fakeRootId));
        }
        // Put the original markups back if they were cleared above.
        if (markupsBackup != null) {
            markups.addAll(markupsBackup);
        }
    }
}
Also used : AnnotationSetImpl(gate.annotation.AnnotationSetImpl) AnnotationSet(gate.AnnotationSet) OutputStreamWriter(java.io.OutputStreamWriter) InvalidOffsetException(gate.util.InvalidOffsetException) IOException(java.io.IOException) HashSet(java.util.HashSet)

Aggregations

AnnotationSet (gate.AnnotationSet)43 Annotation (gate.Annotation)27 ArrayList (java.util.ArrayList)14 HashMap (java.util.HashMap)11 HashSet (java.util.HashSet)11 Document (gate.Document)9 List (java.util.List)8 FeatureMap (gate.FeatureMap)7 InvalidOffsetException (gate.util.InvalidOffsetException)6 AnnotationSetImpl (gate.annotation.AnnotationSetImpl)5 Set (java.util.Set)5 StatusListener (gate.event.StatusListener)4 GateRuntimeException (gate.util.GateRuntimeException)4 Point (java.awt.Point)4 IOException (java.io.IOException)4 URL (java.net.URL)4 Map (java.util.Map)4 Color (java.awt.Color)3 TreeSet (java.util.TreeSet)3 TestDocument (gate.corpora.TestDocument)2