Search in sources :

Example 6 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class Annotandum method unpackMarkup.

/**
 * Rebuilds the document text and its "Original markups" annotations from
 * column-formatted input. Each input line is split on whitespace: the first
 * item is the token, and every further column carries a tag ("O", "U-FOO",
 * "L-FOO", "B-FOO", "I-FOO", or a bare "FOO"). The new text is accumulated
 * in a StringBuilder, the pending annotations are collected as Annotandum
 * objects, and finally the document content is replaced and the annotations
 * are added to the Original markups annotation set.
 *
 * NOTE(review): this excerpt has lost its closing braces and apparently a
 * few statements; the nesting described in the comments below is inferred
 * from indentation and should be confirmed against the full source.
 *
 * @param doc the GATE document whose content is parsed and then replaced
 * @throws DocumentFormatException if doc is null, if it has neither a source
 *         URL nor content, or if an InvalidOffsetException is raised while
 *         editing the content or adding the annotations
 */
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    // reject a missing document, or one with neither a source URL nor content
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    // split on runs of newline / carriage-return characters, so the array
    // holds one entry per non-empty input line
    String[] lines = doc.getContent().toString().split("[\\n\\r]+");
    // replacement text for the document, built up token by token
    StringBuilder newContent = new StringBuilder();
    // Items of data to be turned into Original markups annotations
    List<Annotandum> annotanda = new ArrayList<Annotandum>();
    // Currently open tags: created by "B-FOO", extended by "I-FOO", closed
    // by "O" or end of sentence.
    Map<String, Annotandum> inProgress = new HashMap<String, Annotandum>();
    /* Note: I-Foo handling currently has a weak spot.
     * this    B-Foo
     * is      B-Bar
     * strange I-Foo
     * will result in a Foo annotation spanning "this is strange", because
     * the I-Foo extends the existing B-Foo.  If the sentence is cut off 
     * before hitting another I-Foo, however, the Foo annotation will not
     * have been extended.  But this situation will not occur in carefully
     * edited input.  
    // NOTE(review): the terminator of the block comment above is missing from
    // this excerpt; the statements below are meant to be live code.
    // oldEnd: value of "end" from the previous line (used when closing tags);
    // start / end: offsets in newContent recorded for the current line
    long oldEnd = 0L;
    long start = 0L;
    long end = 0L;
    for (String line : lines) {
        oldEnd = end;
        start = newContent.length();
        String[] items = line.split("\\s+");
        // no items on this line: close any annotations in progress
        if (items.length == 0) {
            end = newContent.length();
            finishAllTags(inProgress, annotanda, oldEnd);
        } else {
            String token = items[0];
            // NOTE(review): no visible statement appends the token text to
            // newContent before "end" is read below; a line is presumably
            // missing from this excerpt -- confirm.
            // We've agreed to put the space after every token.
            end = newContent.length();
            newContent.append(' ');
            // Create Token and following SpaceToken annotation.
            annotanda.add(Annotandum.makeToken(start, end, token));
            // columns 1..n-1 each hold one tag for this token
            for (int column = 1; column < items.length; column++) {
                // O means close all annotations in progress
                if (items[column].equals("O")) {
                    finishAllTags(inProgress, annotanda, oldEnd);
                } else // "U-FOO": single-token annotation, after closing any "FOO" already in progress
                if ((items[column].length() > 2) && items[column].startsWith("U-")) {
                    String type = items[column].substring(2);
                    finishTag(type, inProgress, annotanda, oldEnd);
                    annotanda.add(new Annotandum(type, start, end, column, true));
                } else // "L-FOO": extend the current "FOO" to this token, then close it
                if ((items[column].length() > 2) && items[column].startsWith("L-")) {
                    String type = items[column].substring(2);
                    if (inProgress.containsKey(type)) {
                        // good L-FOO, so update the end offset
                        inProgress.get(type).endOffset = end;
                    } else {
                        // bad data, containing L-FOO without a B-FOO, so open a "FOO" here
                        inProgress.put(type, new Annotandum(type, start, end, column, true));
                    // in either case the "FOO" annotation ends at this token
                    finishTag(type, inProgress, annotanda, end);
                } else // "B-FOO": begin a new "FOO", after closing any "FOO" already in progress
                if ((items[column].length() > 2) && items[column].startsWith("B-")) {
                    String type = items[column].substring(2);
                    finishTag(type, inProgress, annotanda, oldEnd);
                    inProgress.put(type, new Annotandum(type, start, end, column, true));
                } else // "I-FOO": extend current "FOO" annotation
                if ((items[column].length() > 2) && items[column].startsWith("I-")) {
                    String type = items[column].substring(2);
                    if (inProgress.containsKey(type)) {
                        // good I-FOO, so update the end offset
                        inProgress.get(type).endOffset = end;
                    } else {
                        // bad data, containing I-FOO without a B-FOO, so treat as if B-FOO
                        inProgress.put(type, new Annotandum(type, start, end, column, true));
                } else // "FOO": treat as single-token annotation (such as POS tag)
                    // NOTE(review): "tag" is constructed but no visible statement
                    // adds it to annotanda; presumably stripped -- confirm.
                    Annotandum tag = new Annotandum(items[column], start, end, column, false);
    // end of input: close any remaining annotations
    finishAllTags(inProgress, annotanda, end);
    // set new content & create Original markups annotations
    try {
        DocumentContent newContentImpl = new DocumentContentImpl(newContent.toString());
        // replace the whole of the old content with the rebuilt text
        doc.edit(0L, doc.getContent().size(), newContentImpl);
        long newSize = doc.getContent().size();
        AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
        for (Annotandum ann : annotanda) {
            if (DEBUG) {
                // clamp the end offset to the document size for the debug text only
                String string = Utils.stringFor(doc, ann.startOffset, (ann.endOffset <= newSize) ? ann.endOffset : newSize);
                System.out.format("%d  %d  %s  %s\n", ann.startOffset, ann.endOffset, ann.type, string);
            // the unclamped offsets are used for the annotation itself
            originalMarkups.add(ann.startOffset, ann.endOffset, ann.type, ann.features);
    } catch (InvalidOffsetException e) {
        // surface offset problems as a document-format failure, keeping the cause
        throw new DocumentFormatException(e);
Also used : InvalidOffsetException(gate.util.InvalidOffsetException) DocumentFormatException(gate.util.DocumentFormatException)

Example 7 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class AnnotationSetImpl method getNodes.

/**
 * Returns the nodes corresponding to the Longs. The Nodes are created if
 * they don't exist.
 */
private final Node[] getNodes(Long start, Long end) throws InvalidOffsetException {
    // Resolves an offset pair to a two-element { start node, end node } array,
    // reusing nodes found in nodesByOffset and constructing a NodeImpl for any
    // offset that has none. Throws InvalidOffsetException when the document
    // rejects the range.
    // NOTE(review): closing braces (and possibly statements) are missing from
    // this excerpt; the nesting described here is inferred from indentation.
    // are the offsets valid?
    if (!doc.isValidOffsetRange(start, end)) {
        // report the offending range together with the document size
        throw new InvalidOffsetException("Offsets [" + start + ":" + end + "] not valid for this document of size " + doc.getContent().size());
    // to find out if nodes need creating or if they exist already
    if (nodesByOffset == null) {
    // NOTE(review): the body of the null check above is not visible here;
    // presumably it initialises nodesByOffset -- confirm against the full source.
    // find existing nodes if appropriate nodes don't already exist,
    // create them
    Node startNode = nodesByOffset.get(start);
    if (startNode == null)
        startNode = new NodeImpl(doc.getNextNodeId(), start);
    Node endNode = null;
    if (start.equals(end)) {
        // zero-length span: the same node serves as both start and end
        endNode = startNode;
        return new Node[] { startNode, endNode };
    // distinct offsets: look up the end node separately
    endNode = nodesByOffset.get(end);
    if (endNode == null)
        endNode = new NodeImpl(doc.getNextNodeId(), end);
    return new Node[] { startNode, endNode };
Also used : Node(gate.Node) InvalidOffsetException(gate.util.InvalidOffsetException)

Example 8 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class NekoHtmlDocumentHandler method endDocument.

/**
 * Called when the parser reaches the end of the document. Here we
 * store the new content and construct the Original markups
 * annotations.
 */
public void endDocument(Augmentations augs) throws XNIException {
    // Replaces the document content with the text accumulated in
    // tmpDocContent, then turns each CustomObject queued in colector into an
    // annotation in basicAS, and finally reports the element count to the
    // status listeners.
    // NOTE(review): closing braces and some statements are missing from this
    // excerpt; the nesting described here is inferred from indentation.
    if (DEBUG_GENERAL) {
    // NOTE(review): the body of the debug check above is not visible here.
    CustomObject obj = null;
    // replace the old content with the new one
    doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
    // If basicAS is null, use the Original markups annotation set
    // from this gate document
    if (basicAS == null)
        basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    // sort colector ascending on its id
    // NOTE(review): no sorting statement is visible after the comment above.
    // iterate through colector and construct annotations
    while (!colector.isEmpty()) {
        obj = colector.getFirst();
        // NOTE(review): no statement removing obj from colector is visible in
        // this excerpt; presumably one was stripped -- confirm.
        // Construct an annotation from this obj
        try {
            basicAS.add(obj.getStart(), obj.getEnd(), obj.getElemName(), obj.getFM());
        } catch (InvalidOffsetException e) {
            // best effort: report the rejected element and carry on
            Err.prln("Error creating an annot :" + obj + " Discarded...");
    // end try
    // }// end if
    // while
    // notify the listener about the total amount of elements that
    // has been processed
    fireStatusChangedEvent("Total elements : " + elements);
Also used : DocumentContentImpl(gate.corpora.DocumentContentImpl) InvalidOffsetException(gate.util.InvalidOffsetException)

Example 9 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class LuceneDocument method createTokens.

/**
 * Scans the document text character by character and adds one
 * Constants.ANNIC_TOKEN annotation, carrying a "string" feature, to the given
 * set for each run of text between whitespace characters.
 *
 * NOTE(review): closing braces are missing from this excerpt; the nesting
 * described in the comments below is inferred from indentation.
 *
 * @param gateDocument the document whose content is tokenised
 * @param set the annotation set that receives the token annotations
 * @return false if the set rejects an offset pair (InvalidOffsetException) or
 *         if no token start was ever recorded; true otherwise
 */
private boolean createTokens(gate.Document gateDocument, AnnotationSet set) {
    String gateContent = gateDocument.getContent().toString();
    // index where the current token began; -1 until the first
    // non-whitespace character has been seen
    int start = -1;
    for (int i = 0; i < gateContent.length(); i++) {
        char c = gateContent.charAt(i);
        if (Character.isWhitespace(c)) {
            // whitespace ends the token that began at "start", if any
            if (start != -1) {
                FeatureMap features = gate.Factory.newFeatureMap();
                String string = gateContent.substring(start, i);
                // skip candidates that are empty or whitespace-only
                if (string.trim().length() > 0) {
                    features.put("string", string);
                    try {
                        set.add(Long.valueOf(start), Long.valueOf(i), Constants.ANNIC_TOKEN, features);
                    } catch (InvalidOffsetException ioe) {
                        // the failure is reported to the caller only as "false"
                        return false;
                // the next token can begin right after this whitespace character
                start = i + 1;
        } else {
            // first non-whitespace character seen so far
            if (start == -1)
                start = i;
    // no non-whitespace character was found at all
    if (start == -1)
        return false;
    // text remaining after the last whitespace character forms a final token
    if (start < gateContent.length()) {
        FeatureMap features = gate.Factory.newFeatureMap();
        String string = gateContent.substring(start, gateContent.length());
        if (string.trim().length() > 0) {
            features.put("string", string);
            try {
                set.add(Long.valueOf(start), Long.valueOf(gateContent.length()), Constants.ANNIC_TOKEN, features);
            } catch (InvalidOffsetException ioe) {
                return false;
    return true;
Also used : FeatureMap(gate.FeatureMap) InvalidOffsetException(gate.util.InvalidOffsetException)

Example 10 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class LuceneDocument method getTokens.

/**
 * This method given a GATE document and other required parameters, for each
 * annotation of type indexUnitAnnotationType creates a separate list of
 * baseTokens underlying in it.
 */
private List<Token>[] getTokens(gate.Document document, AnnotationSet inputAs, List<String> featuresToInclude, List<String> featuresToExclude, String baseTokenAnnotationType, AnnotationSet baseTokenSet, String indexUnitAnnotationType, AnnotationSet indexUnitSet, Set<String> indexedFeatures) {
    // Builds one list of Token objects per index unit (an OffsetGroup) and
    // returns them as an array sized by the number of units. Names of the
    // "type.feature" entries that get indexed are added to indexedFeatures.
    // NOTE(review): this excerpt has lost its closing braces and a number of
    // statements (iterator calls, "continue"s, list additions); the nesting
    // described here is inferred from indentation and the gaps are flagged
    // where they are visible.
    boolean excludeFeatures = false;
    boolean includeFeatures = false;
    // choose the filtering mode: a non-empty include list takes precedence,
    // otherwise a non-empty exclude list is used
    if (!featuresToInclude.isEmpty()) {
        includeFeatures = true;
    } else if (!featuresToExclude.isEmpty()) {
        excludeFeatures = true;
    HashSet<OffsetGroup> unitOffsetsSet = new HashSet<OffsetGroup>();
    if (indexUnitAnnotationType == null || indexUnitAnnotationType.trim().length() == 0 || indexUnitSet == null || indexUnitSet.size() == 0) {
        // the index Unit Annotation Type is not specified
        // therefore we consider the entire document as a single unit
        OffsetGroup group = new OffsetGroup();
        group.startOffset = 0L;
        group.endOffset = document.getContent().size();
        // NOTE(review): no visible statement adds "group" to unitOffsetsSet;
        // presumably stripped -- confirm.
    } else {
        // one unit per annotation in indexUnitSet
        Iterator<Annotation> iter = indexUnitSet.iterator();
        while (iter.hasNext()) {
            // NOTE(review): the right-hand side of the next statement is
            // missing from this excerpt.
            Annotation annotation =;
            OffsetGroup group = new OffsetGroup();
            group.startOffset = annotation.getStartNode().getOffset();
            group.endOffset = annotation.getEndNode().getOffset();
    Set<String> allTypes = new HashSet<String>();
    for (String aType : inputAs.getAllTypes()) {
        // type names containing '.', '=', ';' or ',' are reported on stderr
        // NOTE(review): the check also tests ',' although the message does not
        // mention it; no visible statement populates allTypes.
        if (aType.indexOf(".") > -1 || aType.indexOf("=") > -1 || aType.indexOf(";") > -1 || aType.indexOf(",") > -1) {
            System.err.println("Annotations of type " + aType + " cannot be indexed as the type name contains one of the ., =, or ; character");
    // NOTE(review): the bodies of the next two checks are not visible here.
    if (baseTokenSet != null && baseTokenSet.size() > 0) {
    if (indexUnitSet != null && indexUnitSet.size() > 0)
    // copy the annotations of the selected types into a fresh set
    AnnotationSet toUseSet = new AnnotationSetImpl(document);
    for (String type : allTypes) {
        for (Annotation a : inputAs.get(type)) {
            try {
                toUseSet.add(a.getStartNode().getOffset(), a.getEndNode().getOffset(), a.getType(), a.getFeatures());
            } catch (InvalidOffsetException ioe) {
                // rethrown unchecked, keeping the original exception as the cause
                throw new GateRuntimeException(ioe);
    // generic array creation needs the unchecked cast from a raw List[]
    @SuppressWarnings({ "cast", "unchecked", "rawtypes" }) List<Token>[] toReturn = (List<Token>[]) new List[unitOffsetsSet.size()];
    Iterator<OffsetGroup> iter = unitOffsetsSet.iterator();
    // index into toReturn for the current unit
    int counter = 0;
    while (iter.hasNext()) {
        // NOTE(review): the right-hand side of the next statement is missing
        // from this excerpt.
        OffsetGroup group =;
        List<Token> newTokens = new ArrayList<Token>();
        // annotations lying within this unit's offsets
        List<Annotation> tokens = new ArrayList<Annotation>(toUseSet.getContained(group.startOffset, group.endOffset));
        // add tokens from the baseTokenSet
        if (baseTokenSet != null && baseTokenSet.size() != 0) {
            tokens.addAll(baseTokenSet.getContained(group.startOffset, group.endOffset));
        // a unit without any annotations makes the whole call return null
        if (tokens.isEmpty())
            return null;
        Collections.sort(tokens, new OffsetComparator());
        // running position; starts at -1 and is advanced by "inc" per annotation
        int position = -1;
        for (int i = 0; i < tokens.size(); i++) {
            // position increment for this annotation (reset to 0 further down
            // when it starts at the same offset as the previous one)
            byte inc = 1;
            Annotation annot = tokens.get(i);
            String type = annot.getType();
            // if the feature is specified in featuresToExclude -exclude it
            // NOTE(review): the statements governed by the next two checks are
            // not visible in this excerpt (presumably "continue") -- confirm.
            if (excludeFeatures && featuresToExclude.contains(type))
            // exclude it
            if (includeFeatures && !featuresToInclude.contains(type))
            int startOffset = annot.getStartNode().getOffset().intValue();
            int endOffset = annot.getEndNode().getOffset().intValue();
            // document text covered by this annotation
            String text = document.getContent().toString().substring(startOffset, endOffset);
            // NOTE(review): token1 and the other Token objects built below are
            // never visibly added to newTokens; those statements appear stripped.
            Token token1 = new Token(type, startOffset, endOffset, "*");
            // we add extra info of position
            if (i > 0) {
                // same start offset as the previous annotation: do not advance
                if (annot.getStartNode().getOffset().longValue() == tokens.get(i - 1).getStartNode().getOffset().longValue()) {
                    inc = 0;
            position += inc;
            // annotations that are not base tokens, or that lack a "string"
            // feature, get a synthetic "<type>.string" entry from the text
            if (!type.equals(baseTokenAnnotationType) || (annot.getFeatures().get("string") == null)) {
                // we need to create one string feature for this
                Token tk1 = new Token(text, startOffset, endOffset, type + ".string");
                indexedFeatures.add(type + ".string");
            // now find out the features and add them
            FeatureMap features = annot.getFeatures();
            Iterator<Object> fIter = features.keySet().iterator();
            while (fIter.hasNext()) {
                // NOTE(review): the right-hand side of the next statement is
                // missing from this excerpt.
                String type1 =;
                // it
                // NOTE(review): the statements governed by the next three checks
                // are not visible in this excerpt (presumably "continue").
                if (excludeFeatures && featuresToExclude.contains(type + "." + type1)) {
                // exclude it
                if (includeFeatures && !featuresToInclude.contains(type + "." + type1))
                Object tempText = features.get(type1);
                if (tempText == null)
                String text1 = tempText.toString();
                // we need to qualify the type names
                // for each annotation type feature we add AT.Feature=="**" to be able
                // to search for it
                // to calculate stats
                Token tempToken = new Token(text1, startOffset, endOffset, type + "." + type1);
                indexedFeatures.add(type + "." + type1);
                Token onlyATFeature = new Token(type + "." + type1, startOffset, endOffset, "**");
        // NOTE(review): no visible statement increments counter.
        toReturn[counter] = newTokens;
    return toReturn;
Also used : ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) Token(gate.creole.annic.apache.lucene.analysis.Token) GateRuntimeException(gate.util.GateRuntimeException) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) FeatureMap(gate.FeatureMap) AnnotationSetImpl(gate.annotation.AnnotationSetImpl) OffsetComparator(gate.util.OffsetComparator)


InvalidOffsetException (gate.util.InvalidOffsetException)15 Annotation (gate.Annotation)6 AnnotationSet (gate.AnnotationSet)5 ArrayList (java.util.ArrayList)5 HashSet (java.util.HashSet)4 FeatureMap (gate.FeatureMap)3 AnnotationSetImpl (gate.annotation.AnnotationSetImpl)3 DocumentFormatException (gate.util.DocumentFormatException)3 GateRuntimeException (gate.util.GateRuntimeException)3 IOException (java.io.IOException)3 List (java.util.List)3 DocumentContentImpl (gate.corpora.DocumentContentImpl)2 XMLStreamException (javax.xml.stream.XMLStreamException)2 CorpusExporter (gate.CorpusExporter)1 Document (gate.Document)1 Node (gate.Node)1 Token (gate.creole.annic.apache.lucene.analysis.Token)1 Document (gate.creole.annic.apache.lucene.document.Document)1 EmailDocumentHandler (gate.email.EmailDocumentHandler)1 StatusListener (gate.event.StatusListener)1