Search in sources :

Example 66 with Annotation

use of gate.Annotation in project gate-core by GateNLP.

the class DocumentStaxUtils method writeXcesAnnotations.

/**
 * Serializes a collection of annotations as an XCES <code>cesAna</code>
 * element on the supplied XMLStreamWriter. Only the cesAna element is
 * produced: the XML declaration, if wanted, must be written by the
 * caller, and the writer is left open when this method returns.
 * <p>
 * Each annotation becomes one <code>struct</code> element carrying
 * <code>type</code>, <code>from</code> and <code>to</code> attributes,
 * with one <code>feat</code> child per feature. The feature named
 * "isEmptyAndSpan" is never written. Annotations are emitted in the
 * order defined by <code>LONGEST_FIRST_OFFSET_COMPARATOR</code>.
 * <p>
 * XML-illegal characters in feature <i>values</i> are replaced by
 * {@link #INVALID_CHARACTER_REPLACEMENT} (a space). Feature <i>names</i>
 * and annotation types are written unmodified, so an illegal character
 * in either of those makes the serialization fail.
 *
 * @param annotations the annotations to save, typically an
 *          AnnotationSet; the collection itself is not modified (a
 *          sorted copy is used)
 * @param xsw the XMLStreamWriter to write to
 * @param includeId if true, each <code>struct</code> also gets an "n"
 *          attribute holding the annotation ID
 * @throws XMLStreamException if the underlying writer reports an error
 */
public static void writeXcesAnnotations(Collection<Annotation> annotations, XMLStreamWriter xsw, boolean includeId) throws XMLStreamException {
    final String structIndent = "   ";
    final String featIndent = structIndent + structIndent;

    // work on a sorted copy so the caller's collection is left untouched
    List<Annotation> sorted = new ArrayList<Annotation>(annotations);
    Collections.sort(sorted, LONGEST_FIRST_OFFSET_COMPARATOR);

    // root element, bound to the XCES namespace as the default namespace
    xsw.setDefaultNamespace(XCES_NAMESPACE);
    xsw.writeStartElement(XCES_NAMESPACE, "cesAna");
    xsw.writeDefaultNamespace(XCES_NAMESPACE);
    xsw.writeAttribute("version", XCES_VERSION);
    newLine(xsw);

    for (Annotation annot : sorted) {
        long from = annot.getStartNode().getOffset().longValue();
        long to = annot.getEndNode().getOffset().longValue();
        FeatureMap features = annot.getFeatures();
        boolean hasFeatures = features != null && !features.isEmpty();

        // a struct without features is self-closing, otherwise it is
        // left open so the feat children can be nested inside it
        xsw.writeCharacters(structIndent);
        if (hasFeatures) {
            xsw.writeStartElement(XCES_NAMESPACE, "struct");
        } else {
            xsw.writeEmptyElement(XCES_NAMESPACE, "struct");
        }
        xsw.writeAttribute("type", annot.getType());
        xsw.writeAttribute("from", String.valueOf(from));
        xsw.writeAttribute("to", String.valueOf(to));
        if (includeId) {
            // the annotation ID goes into the "n" attribute
            xsw.writeAttribute("n", String.valueOf(annot.getId()));
        }
        newLine(xsw);

        if (!hasFeatures) {
            continue;
        }

        for (Map.Entry<Object, Object> feature : features.entrySet()) {
            if ("isEmptyAndSpan".equals(feature.getKey())) {
                // this feature is deliberately left out of the output
                continue;
            }
            xsw.writeCharacters(featIndent);
            xsw.writeEmptyElement(XCES_NAMESPACE, "feat");
            xsw.writeAttribute("name", String.valueOf(feature.getKey()));
            xsw.writeAttribute("value", replaceXMLIllegalCharactersInString(String.valueOf(feature.getValue())));
            newLine(xsw);
        }
        // close the struct opened above
        xsw.writeCharacters(structIndent);
        xsw.writeEndElement();
        newLine(xsw);
    }

    // close cesAna
    xsw.writeEndElement();
    newLine(xsw);
}
Also used : FeatureMap(gate.FeatureMap) ArrayList(java.util.ArrayList) HashMap(java.util.HashMap) Map(java.util.Map) FeatureMap(gate.FeatureMap) Annotation(gate.Annotation)

Example 67 with Annotation

use of gate.Annotation in project gate-core by GateNLP.

the class DocumentStaxUtils method writeTextWithNodes.

/**
 * Writes the content of the given document to an XMLStreamWriter as a
 * mixed content element called "TextWithNodes". At each offset where an
 * annotation in one of the supplied collections starts or ends, an
 * empty "Node" element is written whose "id" attribute is that offset.
 * <p>
 * If the document has no content (the content object, or its string
 * form, is null) a single empty "TextWithNodes" element is written.
 * <p>
 * XML-illegal characters are replaced slice by slice, i.e. separately
 * for each run of text between two nodes, so that a surrogate pair
 * split by a node is treated as two illegal halves.
 *
 * @param doc the document whose content is to be written
 * @param annotationSets the annotations for which nodes are required.
 *          This is a collection of collections; the outer collection
 *          and any of its elements may be null, in which case they
 *          contribute no nodes.
 * @param xsw the {@link XMLStreamWriter} to write to.
 * @param namespaceURI the namespace URI. May be empty but may not be
 *          null.
 * @throws XMLStreamException if the underlying writer reports an error
 */
public static void writeTextWithNodes(Document doc, Collection<Collection<Annotation>> annotationSets, XMLStreamWriter xsw, String namespaceURI) throws XMLStreamException {
    // Test the content object itself before calling toString() on it:
    // checking only the resulting string would let a null content fail
    // with a NullPointerException instead of reaching the guard below.
    // Only toString() is needed, so the reference is held as Object.
    Object content = doc.getContent();
    String aText = (content == null) ? null : content.toString();
    // no text, so return an empty element
    if (aText == null) {
        xsw.writeEmptyElement(namespaceURI, "TextWithNodes");
        return;
    }
    // build a set of all the offsets where Nodes are required
    TreeSet<Long> offsetsSet = new TreeSet<Long>();
    if (annotationSets != null) {
        for (Collection<Annotation> set : annotationSets) {
            if (set != null) {
                for (Annotation annot : set) {
                    offsetsSet.add(annot.getStartNode().getOffset());
                    offsetsSet.add(annot.getEndNode().getOffset());
                }
            }
        }
    }
    // write the TextWithNodes element
    char[] textArray = aText.toCharArray();
    xsw.writeStartElement(namespaceURI, "TextWithNodes");
    int lastNodeOffset = 0;
    // a TreeSet iterates in ascending order, so the nodes come out in
    // document order
    for (Long nodeOffset : offsetsSet) {
        int offset = nodeOffset.intValue();
        int sliceLength = offset - lastNodeOffset;
        // write characters since the last node output
        // replace XML-illegal characters in this slice of text - we
        // have to do this here rather than on the text as a whole in
        // case the node falls between the two halves of a surrogate
        // pair (in which case both halves are illegal and must be
        // replaced).
        replaceXMLIllegalCharacters(textArray, lastNodeOffset, sliceLength);
        writeCharactersOrCDATA(xsw, new String(textArray, lastNodeOffset, sliceLength));
        xsw.writeEmptyElement(namespaceURI, "Node");
        xsw.writeAttribute("id", String.valueOf(offset));
        lastNodeOffset = offset;
    }
    // write any remaining text after the last node
    int tailLength = textArray.length - lastNodeOffset;
    replaceXMLIllegalCharacters(textArray, lastNodeOffset, tailLength);
    writeCharactersOrCDATA(xsw, new String(textArray, lastNodeOffset, tailLength));
    // and the closing TextWithNodes
    xsw.writeEndElement();
}
Also used : TreeSet(java.util.TreeSet) Annotation(gate.Annotation)

Example 68 with Annotation

use of gate.Annotation in project gate-core by GateNLP.

the class EmailDocumentFormat method unpackMarkup.

/**
 * Unpacks the markup in the document, converting markup from the
 * native format (e.g. EMAIL) into annotations in GATE format.
 * The markupElementsMap and element2StringMap of this format are handed
 * to the {@link EmailDocumentHandler} that creates the annotations.
 * Parsing of the document's content is always attempted, whether or not
 * the sourceUrl is null, as long as at least one of the two is present.
 * <p>
 * After the handler has run, every "body" annotation found in the
 * original markups annotation set is passed to annotateParagraphs.
 * Status messages from the handler are forwarded through
 * fireStatusChanged for the duration of the call.
 *
 * @param doc The gate document you want to parse.
 * @throws DocumentFormatException if doc is null, if it has neither a
 *           source URL nor content, or if an IOException or
 *           InvalidOffsetException occurs while annotating
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    boolean nothingToParse = doc == null || (doc.getSourceUrl() == null && doc.getContent() == null);
    if (nothingToParse) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    setNewLineProperty(doc);

    // the handler does the actual work of annotating the messages
    final EmailDocumentHandler handler = new gate.email.EmailDocumentHandler(doc, this.markupElementsMap, this.element2StringMap);

    // relay the handler's status messages to this format's own listeners
    final StatusListener statusForwarder = new StatusListener() {

        @Override
        public void statusChanged(String text) {
            // fireStatusChanged is inherited from DocumentFormat
            fireStatusChanged(text);
        }
    };
    handler.addStatusListener(statusForwarder);

    try {
        // create the annotations on the gate document
        handler.annotateMessages();

        // look for paragraphs inside every body annotation
        AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
        AnnotationSet bodies = originalMarkups.get("body");
        boolean hasBodies = bodies != null && !bodies.isEmpty();
        if (hasBodies) {
            for (Annotation body : bodies) {
                int bodyStart = body.getStartNode().getOffset().intValue();
                int bodyEnd = body.getEndNode().getOffset().intValue();
                annotateParagraphs(doc, bodyStart, bodyEnd, GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
            }
        }
    } catch (IOException e) {
        throw new DocumentFormatException("Couldn't create a buffered reader ", e);
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException(e);
    } finally {
        // always detach the relay, even when annotating failed
        handler.removeStatusListener(statusForwarder);
    }
}
Also used : DocumentFormatException(gate.util.DocumentFormatException) EmailDocumentHandler(gate.email.EmailDocumentHandler) EmailDocumentHandler(gate.email.EmailDocumentHandler) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) StatusListener(gate.event.StatusListener) IOException(java.io.IOException) Annotation(gate.Annotation)

Example 69 with Annotation

use of gate.Annotation in project gate-core by GateNLP.

the class UimaDocumentFormat method unpackCasMarkup.

/**
 * Convert UIMA CAS markups to GATE markups.
 * <p>
 * Works on the "Original markups" annotation set of the document, which
 * serves as both input and output. The steps are:
 * <ol>
 * <li>detect the flavour (XCAS if a "CAS" annotation exists, XMI if an
 * "xmi:XMI" annotation exists) and pick the matching type prefix and
 * id feature name;</li>
 * <li>copy the text covered by each List/Array annotation into an
 * "elements" feature on that annotation;</li>
 * <li>replace the document content with the concatenated "sofaString"
 * features of the Sofa annotations;</li>
 * <li>remove the Sofa, container, NULL and View annotations, remembering
 * the member ids of each view;</li>
 * <li>re-create every remaining annotation: those with "begin"/"end"
 * features become GATE annotations at those offsets (and are also
 * added to a "CasViewN" set for each view N listing their id), the
 * others are turned into document features.</li>
 * </ol>
 *
 * @param doc XML document already parsed
 * @throws DocumentFormatException if the document is in neither format,
 *           or if an offset is rejected while reading or creating
 *           annotations
 */
private void unpackCasMarkup(Document doc) throws DocumentFormatException {
    // both names refer to the same annotation set; they are kept apart
    // only to make reads and writes easier to tell from each other
    AnnotationSet inputAS = doc.getAnnotations("Original markups");
    AnnotationSet outputAS = doc.getAnnotations("Original markups");
    // set format specific names
    String casPrefix;
    String idName;
    if (!inputAS.get("CAS").isEmpty()) {
        casPrefix = "uima.cas.";
        idName = "_id";
    } else if (!inputAS.get("xmi:XMI").isEmpty()) {
        casPrefix = "cas:";
        idName = "xmi:id";
    } else {
        throw new DocumentFormatException("The document \"" + doc.getName() + "\" is neither of XCAS nor XMICAS format.");
    }
    // get array/list contained elements annotations
    // The prefix is quoted so that its dots are matched literally, and
    // the pattern is compiled once instead of once per annotation.
    java.util.regex.Pattern collectionType = java.util.regex.Pattern.compile(java.util.regex.Pattern.quote(casPrefix) + "[a-zA-Z]+(List|Array)");
    for (Annotation annotation : inputAS) {
        if (collectionType.matcher(annotation.getType()).matches()) {
            try {
                String elements = doc.getContent().getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()).toString();
                // add contained values as a feature to the array annotation
                if (!elements.trim().equals("")) {
                    annotation.getFeatures().put("elements", elements);
                }
            } catch (InvalidOffsetException e) {
                throw new DocumentFormatException(e);
            }
        }
    }
    // get document content from SOFA annotations
    Set<Annotation> sofaSet = inputAS.get(casPrefix + "Sofa");
    if (sofaSet.size() > 1) {
        Out.prln("More than one UIMA SOFA, annotation offsets won't be correct.");
    }
    StringBuilder documentContent = new StringBuilder();
    for (Annotation annotation : sofaSet) {
        String sofaString = (String) annotation.getFeatures().get("sofaString");
        // a Sofa without a sofaString contributes no text; appending the
        // null reference would put the literal "null" into the content
        if (sofaString != null) {
            documentContent.append(sofaString);
        }
    }
    doc.setContent(new DocumentContentImpl(documentContent.toString()));
    // remove SOFA annotations
    inputAS.removeAll(sofaSet);
    // remove non document annotations
    inputAS.removeAll(inputAS.get("CAS"));
    inputAS.removeAll(inputAS.get("xmi:XMI"));
    inputAS.removeAll(inputAS.get("cas:NULL"));
    // get the views members, views will be added later as annotation sets
    List<List<String>> viewList = new ArrayList<List<String>>();
    for (Annotation view : inputAS.get(casPrefix + "View")) {
        String members = (String) view.getFeatures().get("members");
        if (members == null) {
            // a view without members still occupies a slot, so that the
            // numbering of the CasView sets below is not shifted
            viewList.add(new ArrayList<String>());
        } else {
            viewList.add(Arrays.asList(members.split("\\s+")));
        }
    }
    inputAS.removeAll(inputAS.get(casPrefix + "View"));
    // fill a map with the id as key and the entity name as value
    // this is specific to the Temis Luxid CAS format
    Map<String, String> entityMap = new HashMap<String, String>();
    for (Annotation entity : inputAS.get("com.temis.uima.Entity")) {
        FeatureMap features = entity.getFeatures();
        entityMap.put((String) features.get(idName), (String) features.get("value"));
    }
    try {
        // for each UIMA annotation
        // (iterate over a copy: the set is modified inside the loop)
        for (Annotation annotation : new HashSet<Annotation>(inputAS)) {
            FeatureMap features = Factory.newFeatureMap();
            features.putAll(annotation.getFeatures());
            String start = (String) features.get("begin");
            String end = (String) features.get("end");
            String id = (String) features.get(idName);
            // UIMA feature
            features.remove("begin");
            // UIMA feature
            features.remove("end");
            // GATE feature
            features.remove("isEmptyAndSpan");
            // UIMA XCAS feature
            features.remove("_indexed");
            if (start == null || end == null) {
                // no offsets so add it as a GATE document feature
                features.remove(idName);
                for (Map.Entry<Object, Object> entry : features.entrySet()) {
                    doc.getFeatures().put(annotation.getType() + '_' + id + '.' + entry.getKey(), entry.getValue());
                }
            } else {
                // offsets so add it as a GATE document annotation
                String entityReference = (String) features.get("_ref_entity");
                // annotations referring to a known entity take the
                // entity's name as their type
                String type = entityMap.containsKey(entityReference) ? entityMap.get(entityReference) : annotation.getType();
                Integer gateId = outputAS.add(Long.valueOf(start), Long.valueOf(end), type, features);
                int viewCount = 0;
                for (List<String> viewMembers : viewList) {
                    if (viewMembers.contains(id)) {
                        // add the annotation to the annotation set
                        doc.getAnnotations("CasView" + viewCount).add(outputAS.get(gateId));
                    }
                    viewCount++;
                }
            }
            // delete UIMA annotation
            inputAS.remove(annotation);
        }
    } catch (InvalidOffsetException e) {
        throw new DocumentFormatException("Couldn't create annotation.", e);
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) DocumentFormatException(gate.util.DocumentFormatException) FeatureMap(gate.FeatureMap) ArrayList(java.util.ArrayList) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) FeatureMap(gate.FeatureMap) HashSet(java.util.HashSet)

Aggregations

Annotation (gate.Annotation)69 AnnotationSet (gate.AnnotationSet)28 ArrayList (java.util.ArrayList)24 HashMap (java.util.HashMap)15 Node (gate.Node)10 HashSet (java.util.HashSet)10 List (java.util.List)10 FeatureMap (gate.FeatureMap)8 Map (java.util.Map)8 TreeSet (java.util.TreeSet)8 Document (gate.Document)7 InvalidOffsetException (gate.util.InvalidOffsetException)7 Point (java.awt.Point)6 LinkedList (java.util.LinkedList)5 Set (java.util.Set)5 StatusListener (gate.event.StatusListener)4 GateRuntimeException (gate.util.GateRuntimeException)3 Color (java.awt.Color)3 Stack (java.util.Stack)3 TreeMap (java.util.TreeMap)3