Search in sources :

Example 41 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class EmailDocumentFormat method unpackMarkup.

 * Unpack the markup in the document. This converts markup from the
 * native format (e.g. EMAIL) into annotations in GATE format.
 * Uses the markupElementsMap to determine which elements to convert, and
 * what annotation type names to use.
 * It always tries to parse the doc's content. It doesn't matter if the
 * sourceUrl is null or not.
 * @param doc The gate document you want to parse.
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    // End if
    // create an EmailDocumentHandler
    EmailDocumentHandler emailDocHandler = null;
    emailDocHandler = new, this.markupElementsMap, this.element2StringMap);
    StatusListener statusListener = new StatusListener() {

        public void statusChanged(String text) {
            // this is implemented in and inherited here
    // Register a status listener with it
    try {
        // Call the method that creates annotations on the gate document
        // Process the body annotations and search for paragraphs
        AnnotationSet bodyAnnotations = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
        if (bodyAnnotations != null && !bodyAnnotations.isEmpty()) {
            Iterator<Annotation> iter = bodyAnnotations.iterator();
            while (iter.hasNext()) {
                Annotation a =;
                annotateParagraphs(doc, a.getStartNode().getOffset().intValue(), a.getEndNode().getOffset().intValue(), GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
        // End while
    // End if
    } catch (IOException e) {
        throw new DocumentFormatException("Couldn't create a buffered reader ", e);
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
    } finally {
// End try
Also used : DocumentFormatException(gate.util.DocumentFormatException) EmailDocumentHandler(gate.email.EmailDocumentHandler) EmailDocumentHandler(gate.email.EmailDocumentHandler) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) StatusListener(gate.event.StatusListener) IOException(java.io.IOException) Annotation(gate.Annotation)

Example 42 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class UimaDocumentFormat method unpackCasMarkup.

 * Convert UIMA CAS markups to GATE markups.
 * @param doc XML document already parsed
 * @throws DocumentFormatException error when parsing the file
private void unpackCasMarkup(Document doc) throws DocumentFormatException {
    AnnotationSet inputAS = doc.getAnnotations("Original markups");
    AnnotationSet outputAS = doc.getAnnotations("Original markups");
    // set format specific names
    String casPrefix;
    String idName;
    if (!inputAS.get("CAS").isEmpty()) {
        casPrefix = "uima.cas.";
        idName = "_id";
    } else if (!inputAS.get("xmi:XMI").isEmpty()) {
        casPrefix = "cas:";
        idName = "xmi:id";
    } else {
        throw new DocumentFormatException("The document \"" + doc.getName() + "\" is neither of XCAS nor XMICAS format.");
    // get array/list contained elements annotations
    for (Annotation annotation : inputAS) {
        if (annotation.getType().matches(casPrefix + "[a-zA-Z]+(List|Array)")) {
            try {
                String elements = doc.getContent().getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()).toString();
                // add contained values as a feature to the array annotation
                if (!elements.trim().equals("")) {
                    annotation.getFeatures().put("elements", elements);
            } catch (InvalidOffsetException e) {
                throw new DocumentFormatException(e);
    // get document content from SOFA annotations
    Set<Annotation> sofaSet = inputAS.get(casPrefix + "Sofa");
    if (sofaSet.size() > 1) {
        Out.prln("More than one UIMA SOFA, annotation offsets won't be correct.");
    StringBuilder documentContent = new StringBuilder();
    for (Annotation annotation : sofaSet) {
        documentContent.append((String) annotation.getFeatures().get("sofaString"));
    doc.setContent(new DocumentContentImpl(documentContent.toString()));
    // remove SOFA annotations
    // remove non document annotations
    // get the views members, views will be added later as annotation sets
    List<List<String>> viewList = new ArrayList<List<String>>();
    for (Annotation view : inputAS.get(casPrefix + "View")) {
        viewList.add(Arrays.asList(((String) view.getFeatures().get("members")).split("\\s+")));
    inputAS.removeAll(inputAS.get(casPrefix + "View"));
    // fill a map with the id as key and the entity name as value
    // this is specific to the Temis Luxid CAS format
    Map<String, String> entityMap = new HashMap<String, String>();
    for (Annotation entity : inputAS.get("com.temis.uima.Entity")) {
        FeatureMap features = entity.getFeatures();
        entityMap.put((String) features.get(idName), (String) features.get("value"));
    try {
        // for each UIMA annotation
        for (Annotation annotation : new HashSet<Annotation>(inputAS)) {
            FeatureMap features = Factory.newFeatureMap();
            String start = (String) features.get("begin");
            String end = (String) features.get("end");
            String id = (String) features.get(idName);
            // UIMA feature
            // UIMA feature
            // GATE feature
            // UIMA XCAS feature
            if (start == null || end == null) {
                // no offsets so add it as a GATE document feature
                for (Map.Entry<Object, Object> entry : features.entrySet()) {
                    doc.getFeatures().put(annotation.getType() + '_' + id + '.' + entry.getKey(), entry.getValue());
            } else {
                // offsets so add it as a GATE document annotation
                String entityReference = (String) features.get("_ref_entity");
                String type = entityMap.containsKey(entityReference) ? entityMap.get(entityReference) : annotation.getType();
                Integer gateId = outputAS.add(Long.valueOf(start), Long.valueOf(end), type, features);
                int viewCount = 0;
                for (List<String> viewMembers : viewList) {
                    if (viewMembers.contains(id)) {
                        // add the annotation to the annotation set
                        doc.getAnnotations("CasView" + viewCount).add(outputAS.get(gateId));
            // delete UIMA annotation
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException("Couldn't create annotation.", e);
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) DocumentFormatException(gate.util.DocumentFormatException) FeatureMap(gate.FeatureMap) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) FeatureMap(gate.FeatureMap) HashSet(java.util.HashSet)

Example 43 with AnnotationSet

use of gate.AnnotationSet in project gate-core by GateNLP.

the class InlineXMLExporter method export.

public void export(Document doc, OutputStream out, FeatureMap options) throws IOException {
    Integer rootID = null;
    AnnotationSet withRoot = null;
    AnnotationSet originalMarkups = null;
    AnnotationSet backupOriginalMarkups = null;
    try {
        AnnotationSet allAnnots = doc.getAnnotations((String) options.get("annotationSetName"));
        if (!(Boolean) options.get("includeOriginalMarkups")) {
            originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
            backupOriginalMarkups = new AnnotationSetImpl(originalMarkups);
        // first transfer the annotation types from a list to a set
        @SuppressWarnings("unchecked") Set<String> types2Export = new HashSet<String>((List<String>) options.get("annotationTypes"));
        // then get the annotations for export
        AnnotationSet annots2Export = allAnnots.get(types2Export);
        withRoot = new AnnotationSetImpl(doc);
        String rootType = (String) options.get("rootElement");
        if (rootType != null && !"".equals(rootType)) {
            // add the root element to the set
            rootID = withRoot.add(0L, doc.getContent().size(), (String) options.get("rootElement"), Factory.newFeatureMap());
        // create a writer using the specified encoding
        OutputStreamWriter writer = new OutputStreamWriter(out, (String) options.get("encoding"));
        // write the document
        writer.write(doc.toXml(withRoot, (Boolean) options.get("includeFeatures")));
        // make sure it gets written
    } catch (InvalidOffsetException e) {
        throw new IOException(e);
    } finally {
        // delete the fake root element
        if (rootID != null)
        // restore the original markups
        if (backupOriginalMarkups != null)
Also used : AnnotationSetImpl(gate.annotation.AnnotationSetImpl) AnnotationSet(gate.AnnotationSet) OutputStreamWriter(java.io.OutputStreamWriter) InvalidOffsetException(gate.util.InvalidOffsetException) IOException(java.io.IOException) HashSet(java.util.HashSet)


AnnotationSet (gate.AnnotationSet)43 Annotation (gate.Annotation)27 ArrayList (java.util.ArrayList)14 HashMap (java.util.HashMap)11 HashSet (java.util.HashSet)11 Document (gate.Document)9 List (java.util.List)8 FeatureMap (gate.FeatureMap)7 InvalidOffsetException (gate.util.InvalidOffsetException)6 AnnotationSetImpl (gate.annotation.AnnotationSetImpl)5 Set (java.util.Set)5 StatusListener (gate.event.StatusListener)4 GateRuntimeException (gate.util.GateRuntimeException)4 Point (java.awt.Point)4 IOException (java.io.IOException)4 URL (java.net.URL)4 Map (java.util.Map)4 Color (java.awt.Color)3 TreeSet (java.util.TreeSet)3 TestDocument (gate.corpora.TestDocument)2