Search in sources :

Example 6 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class Annotandum method unpackMarkup.

/**
 * Rebuilds the document content from column-formatted input and records the
 * tag columns as annotations in the Original markups annotation set.
 * <p>
 * The current content is split into lines; each line is split on whitespace.
 * The first item of a line is the token text, every further item is a tag
 * column. The new content consists of each token followed by a single space.
 * Tag columns are interpreted as follows:
 * <ul>
 * <li>{@code O} - finish all annotations currently in progress;</li>
 * <li>{@code U-FOO} - finish any FOO in progress, then record a FOO
 * annotation covering just this token;</li>
 * <li>{@code B-FOO} - finish any FOO in progress, then start a new FOO;</li>
 * <li>{@code I-FOO} - extend the FOO in progress to the end of this token
 * (or start one if none is in progress);</li>
 * <li>{@code L-FOO} - extend the FOO in progress to the end of this token
 * (or start one if none is in progress), then finish it;</li>
 * <li>anything else - recorded as an annotation of that name covering just
 * this token.</li>
 * </ul>
 *
 * @param doc the document whose content is parsed and then replaced
 * @throws DocumentFormatException if {@code doc} is null, if it has neither
 *           a source URL nor content, or if an annotation offset is rejected
 *           while the annotations are being added
 */
@Override
public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
    // NOTE(review): this guard lets through a document that has a source URL
    // but null content; doc.getContent() is dereferenced below without a
    // further check - confirm that state cannot occur.
    if ((doc == null) || (doc.getSourceUrl() == null && doc.getContent() == null)) {
        throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
    }
    setNewLineProperty(doc);
    // Runs of CR/LF characters are treated as a single line separator, so
    // consecutive line breaks do not produce empty entries in between.
    String[] lines = doc.getContent().toString().split("[\\n\\r]+");
    StringBuilder newContent = new StringBuilder();
    // Items of data to be turned into Original markups annotations
    List<Annotandum> annotanda = new ArrayList<Annotandum>();
    // Currently open tags, keyed by annotation type: created by "B-FOO",
    // extended by "I-FOO", closed by "L-FOO", "O" or the end of the input.
    Map<String, Annotandum> inProgress = new HashMap<String, Annotandum>();
    /* Note: I-Foo handling currently has a weak spot.
     *
     * this    B-Foo
     * is      B-Bar
     * strange I-Foo
     *
     * will result in a Foo annotation spanning "this is strange", because
     * the I-Foo extends the existing B-Foo.  If the sentence is cut off
     * before hitting another I-Foo, however, the Foo annotation will not
     * have been extended.  But this situation will not occur in carefully
     * edited input.
     */
    // oldEnd: end offset of the previous line's token (in the new content);
    // start/end: offsets of the current line's token (in the new content).
    long oldEnd = 0L;
    long start = 0L;
    long end = 0L;
    for (String line : lines) {
        oldEnd = end;
        start = newContent.length();
        String[] items = line.split("\\s+");
        // A line with no items: emit a newline and close any annotations in
        // progress at the end of the previous token.
        // NOTE(review): String.split only yields an empty array here for a
        // whitespace-only line; confirm this is the intended way to detect
        // sentence boundaries given that empty lines were collapsed above.
        if (items.length == 0) {
            newContent.append("\n");
            end = newContent.length();
            finishAllTags(inProgress, annotanda, oldEnd);
        } else {
            String token = items[0];
            // We've agreed to put the space after every token.
            newContent.append(token);
            // end is taken before the space is appended, so it excludes it
            end = newContent.length();
            newContent.append(' ');
            // Create Token and following SpaceToken annotation.
            annotanda.add(Annotandum.makeToken(start, end, token));
            annotanda.add(Annotandum.makeSpaceToken(end));
            // Columns after the token are tags; the column index is passed
            // on to each Annotandum created for a tag.
            for (int column = 1; column < items.length; column++) {
                // O means close all annotations in progress
                if (items[column].equals("O")) {
                    finishAllTags(inProgress, annotanda, oldEnd);
                } else // "U-FOO": single-token FOO annotation, after closing any "FOO" already in progress
                if ((items[column].length() > 2) && items[column].startsWith("U-")) {
                    String type = items[column].substring(2);
                    finishTag(type, inProgress, annotanda, oldEnd);
                    annotanda.add(new Annotandum(type, start, end, column, true));
                } else // "L-FOO": last token of "FOO"; extend it to this token and then close it
                if ((items[column].length() > 2) && items[column].startsWith("L-")) {
                    String type = items[column].substring(2);
                    if (inProgress.containsKey(type)) {
                        // good L-FOO, so update the end offset
                        inProgress.get(type).endOffset = end;
                    } else {
                        // bad data, containing L-FOO without a B-FOO, so treat as if B-FOO
                        inProgress.put(type, new Annotandum(type, start, end, column, true));
                    }
                    // unlike the other branches, this closes at the current token's end
                    finishTag(type, inProgress, annotanda, end);
                } else // "B-FOO": start a new "FOO" annotation, after closing any "FOO" already in progress
                if ((items[column].length() > 2) && items[column].startsWith("B-")) {
                    String type = items[column].substring(2);
                    finishTag(type, inProgress, annotanda, oldEnd);
                    inProgress.put(type, new Annotandum(type, start, end, column, true));
                } else // "I-FOO": extend current "FOO" annotation
                if ((items[column].length() > 2) && items[column].startsWith("I-")) {
                    String type = items[column].substring(2);
                    if (inProgress.containsKey(type)) {
                        // good I-FOO, so update the end offset
                        inProgress.get(type).endOffset = end;
                    } else {
                        // bad data, containing I-FOO without a B-FOO, so treat as if B-FOO
                        inProgress.put(type, new Annotandum(type, start, end, column, true));
                    }
                } else // "FOO": treat as single-token annotation (such as POS tag)
                {
                    // NOTE(review): the last constructor argument is false here and
                    // true for the prefixed tags above; its meaning is defined by
                    // Annotandum - confirm there.
                    Annotandum tag = new Annotandum(items[column], start, end, column, false);
                    annotanda.add(tag);
                }
            }
        }
    }
    // end of input: close any remaining annotations
    finishAllTags(inProgress, annotanda, end);
    // set new content & create Original markups annotations
    try {
        DocumentContent newContentImpl = new DocumentContentImpl(newContent.toString());
        // replace the whole of the existing content with the rebuilt text
        doc.edit(0L, doc.getContent().size(), newContentImpl);
        long newSize = doc.getContent().size();
        AnnotationSet originalMarkups = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
        for (Annotandum ann : annotanda) {
            if (DEBUG) {
                // clamp the end offset to the content size for the debug printout only
                String string = Utils.stringFor(doc, ann.startOffset, (ann.endOffset <= newSize) ? ann.endOffset : newSize);
                System.out.format("%d  %d  %s  %s\n", ann.startOffset, ann.endOffset, ann.type, string);
            }
            originalMarkups.add(ann.startOffset, ann.endOffset, ann.type, ann.features);
        }
    } catch (InvalidOffsetException e) {
        // surface offset problems to the caller as a format error, keeping the cause
        throw new DocumentFormatException(e);
    }
}
Also used : InvalidOffsetException(gate.util.InvalidOffsetException) DocumentFormatException(gate.util.DocumentFormatException)

Example 7 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class AnnotationSetImpl method getNodes.

/**
 * Looks up the nodes for the given pair of offsets, creating a new node for
 * any offset that has no node yet.
 *
 * @param start the start offset
 * @param end the end offset
 * @return a two-element array holding the start node and the end node, in
 *         that order; both elements are the same node when the offsets are
 *         equal
 * @throws InvalidOffsetException if the document reports the offset range
 *           as invalid
 */
private final Node[] getNodes(Long start, Long end) throws InvalidOffsetException {
    // reject offset ranges the document does not accept
    if (!doc.isValidOffsetRange(start, end)) {
        throw new InvalidOffsetException("Offsets [" + start + ":" + end + "] not valid for this document of size " + doc.getContent().size());
    }
    // the offset index is needed to tell existing nodes from missing ones
    if (nodesByOffset == null) {
        indexByStartOffset();
    }
    // reuse the node registered at the start offset, or make a new one
    Node first = nodesByOffset.get(start);
    if (first == null) {
        first = new NodeImpl(doc.getNextNodeId(), start);
    }
    final Node last;
    if (start.equals(end)) {
        // zero-length span: one node serves as both ends
        last = first;
    } else {
        Node existing = nodesByOffset.get(end);
        last = (existing != null) ? existing : new NodeImpl(doc.getNextNodeId(), end);
    }
    return new Node[] { first, last };
}
Also used : Node(gate.Node) InvalidOffsetException(gate.util.InvalidOffsetException)

Example 8 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class NekoHtmlDocumentHandler method endDocument.

/**
 * Invoked by the parser once the whole document has been read. Installs the
 * accumulated text as the document content and turns every collected
 * element into an annotation of the Original markups set.
 */
@Override
public void endDocument(Augmentations augs) throws XNIException {
    if (DEBUG_GENERAL) {
        Out.println("endDocument");
    }
    // swap the document's content for the text gathered while parsing
    doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
    // fall back to the document's Original markups set if no set was given
    if (basicAS == null) {
        basicAS = doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    }
    // process the collected elements in their natural (sorted) order
    Collections.sort(colector);
    // drain the collector from the front, one annotation per element
    while (!colector.isEmpty()) {
        CustomObject current = colector.getFirst();
        colector.remove(current);
        try {
            basicAS.add(current.getStart(), current.getEnd(), current.getElemName(), current.getFM());
        } catch (InvalidOffsetException ioe) {
            // best effort: report the element that could not be added and carry on
            Err.prln("Error creating an annot :" + current + " Discarded...");
        }
    }
    // tell listeners how many elements were processed in total
    fireStatusChangedEvent("Total elements : " + elements);
}
Also used : DocumentContentImpl(gate.corpora.DocumentContentImpl) InvalidOffsetException(gate.util.InvalidOffsetException)

Example 9 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class LuceneDocument method createTokens.

/**
 * Adds one {@code Constants.ANNIC_TOKEN} annotation to {@code set} for every
 * whitespace-delimited run of characters in the document content. Each
 * annotation carries a {@code "string"} feature holding the covered text.
 *
 * @param gateDocument the document whose content is tokenised
 * @param set the annotation set that receives the token annotations
 * @return {@code false} if the content contains no non-whitespace character
 *         or if an annotation could not be added because of invalid
 *         offsets; {@code true} otherwise
 */
private boolean createTokens(gate.Document gateDocument, AnnotationSet set) {
    String gateContent = gateDocument.getContent().toString();
    int length = gateContent.length();
    // -1 until the first non-whitespace character has been seen
    int start = -1;
    for (int i = 0; i < length; i++) {
        if (!Character.isWhitespace(gateContent.charAt(i))) {
            if (start == -1) {
                start = i;
            }
            continue;
        }
        if (start == -1) {
            // leading whitespace before the first token
            continue;
        }
        // In a run of whitespace start == i here, so the candidate is empty
        // and the helper adds nothing.
        if (!addAnnicToken(set, gateContent, start, i)) {
            return false;
        }
        start = i + 1;
    }
    if (start == -1) {
        // no token characters at all
        return false;
    }
    // flush the token that runs up to the end of the content, if any
    if (start < length) {
        return addAnnicToken(set, gateContent, start, length);
    }
    return true;
}

/**
 * Adds a single token annotation covering {@code content[from, to)} unless
 * that text trims to the empty string.
 *
 * @return {@code false} only if the annotation set rejected the offsets;
 *         {@code true} if the token was added or skipped as blank
 */
private boolean addAnnicToken(AnnotationSet set, String content, int from, int to) {
    String string = content.substring(from, to);
    if (string.trim().length() == 0) {
        return true;
    }
    FeatureMap features = gate.Factory.newFeatureMap();
    features.put("string", string);
    try {
        set.add(Long.valueOf(from), Long.valueOf(to), Constants.ANNIC_TOKEN, features);
        return true;
    } catch (InvalidOffsetException ioe) {
        // failure is reported to the caller through the return value; the
        // stack trace is kept as the only diagnostic, as before
        ioe.printStackTrace();
        return false;
    }
}
Also used : FeatureMap(gate.FeatureMap) InvalidOffsetException(gate.util.InvalidOffsetException)

Example 10 with InvalidOffsetException

use of gate.util.InvalidOffsetException in project gate-core by GateNLP.

the class LuceneDocument method getTokens.

/**
 * Builds, for every index unit of the document, the list of index tokens
 * derived from the annotations contained in that unit.
 * <p>
 * The index units are the spans of the annotations in {@code indexUnitSet};
 * if {@code indexUnitAnnotationType} is null or blank, or
 * {@code indexUnitSet} is null or empty, the whole document is one unit.
 * For each annotation that passes the include/exclude filters the following
 * tokens are produced, all at the same position:
 * <ul>
 * <li>a token for the annotation type itself (marker {@code "*"});</li>
 * <li>a {@code type.string} token holding the covered text, unless the
 * annotation is a base token that already has a {@code "string"}
 * feature;</li>
 * <li>for each feature with a non-null value, a {@code type.feature} token
 * holding the value and a marker token (marker {@code "**"}) holding the
 * qualified feature name.</li>
 * </ul>
 * Every qualified name for which a value token is produced is also added to
 * {@code indexedFeatures}.
 *
 * @param document the document being indexed
 * @param inputAs the annotations to index; types whose name contains
 *          '.', '=', ';' or ',' are skipped with a message on System.err
 * @param featuresToInclude if non-empty, only the listed annotation types
 *          and {@code type.feature} names are indexed
 * @param featuresToExclude consulted only when {@code featuresToInclude} is
 *          empty; listed annotation types and {@code type.feature} names
 *          are not indexed
 * @param baseTokenAnnotationType the annotation type of the base tokens
 * @param baseTokenSet the set holding the base tokens; may be null
 * @param indexUnitAnnotationType the annotation type of the index units
 * @param indexUnitSet the set holding the index unit annotations; may be
 *          null
 * @param indexedFeatures receives the qualified names that were indexed
 * @return one token list per index unit, in the iteration order of the
 *         internal hash set of units, or {@code null} as soon as a unit is
 *         found that contains no annotations at all
 */
private List<Token>[] getTokens(gate.Document document, AnnotationSet inputAs, List<String> featuresToInclude, List<String> featuresToExclude, String baseTokenAnnotationType, AnnotationSet baseTokenSet, String indexUnitAnnotationType, AnnotationSet indexUnitSet, Set<String> indexedFeatures) {
    // The include list takes precedence: the exclude list is honoured only
    // when the include list is empty.
    boolean includeFeatures = !featuresToInclude.isEmpty();
    boolean excludeFeatures = !includeFeatures && !featuresToExclude.isEmpty();

    // work out the spans of the index units
    HashSet<OffsetGroup> unitOffsetsSet = new HashSet<OffsetGroup>();
    if (indexUnitAnnotationType == null || indexUnitAnnotationType.trim().length() == 0 || indexUnitSet == null || indexUnitSet.size() == 0) {
        // the index Unit Annotation Type is not specified
        // therefore we consider the entire document as a single unit
        OffsetGroup group = new OffsetGroup();
        group.startOffset = 0L;
        group.endOffset = document.getContent().size();
        unitOffsetsSet.add(group);
    } else {
        for (Annotation annotation : indexUnitSet) {
            OffsetGroup group = new OffsetGroup();
            group.startOffset = annotation.getStartNode().getOffset();
            group.endOffset = annotation.getEndNode().getOffset();
            unitOffsetsSet.add(group);
        }
    }

    // collect the annotation types that can be indexed
    Set<String> allTypes = new HashSet<String>();
    for (String aType : inputAs.getAllTypes()) {
        if (aType.indexOf(".") > -1 || aType.indexOf("=") > -1 || aType.indexOf(";") > -1 || aType.indexOf(",") > -1) {
            System.err.println("Annotations of type " + aType + " cannot be indexed as the type name contains one of the '.', '=', ';' or ',' characters");
            continue;
        }
        allTypes.add(aType);
    }
    // base tokens and index units come from their own sets, not from inputAs
    if (baseTokenSet != null && baseTokenSet.size() > 0) {
        allTypes.remove(baseTokenAnnotationType);
    }
    if (indexUnitSet != null && indexUnitSet.size() > 0) {
        allTypes.remove(indexUnitAnnotationType);
    }

    // copy the annotations of the remaining types into a working set
    AnnotationSet toUseSet = new AnnotationSetImpl(document);
    for (String type : allTypes) {
        for (Annotation a : inputAs.get(type)) {
            try {
                toUseSet.add(a.getStartNode().getOffset(), a.getEndNode().getOffset(), a.getType(), a.getFeatures());
            } catch (InvalidOffsetException ioe) {
                throw new GateRuntimeException(ioe);
            }
        }
    }

    @SuppressWarnings({ "cast", "unchecked", "rawtypes" }) List<Token>[] toReturn = (List<Token>[]) new List[unitOffsetsSet.size()];
    // The document text is the same for every annotation, so it is fetched
    // once rather than once per annotation.
    String content = document.getContent().toString();
    int counter = 0;
    for (OffsetGroup group : unitOffsetsSet) {
        List<Token> newTokens = new ArrayList<Token>();
        List<Annotation> tokens = new ArrayList<Annotation>(toUseSet.getContained(group.startOffset, group.endOffset));
        // add tokens from the baseTokenSet
        if (baseTokenSet != null && baseTokenSet.size() != 0) {
            tokens.addAll(baseTokenSet.getContained(group.startOffset, group.endOffset));
        }
        // a unit without any annotation aborts the whole operation
        if (tokens.isEmpty()) {
            return null;
        }
        Collections.sort(tokens, new OffsetComparator());
        int position = -1;
        for (int i = 0; i < tokens.size(); i++) {
            Annotation annot = tokens.get(i);
            String type = annot.getType();
            // skip annotation types ruled out by the exclude list ...
            if (excludeFeatures && featuresToExclude.contains(type)) {
                continue;
            }
            // ... or not named in the include list
            if (includeFeatures && !featuresToInclude.contains(type)) {
                continue;
            }
            int startOffset = annot.getStartNode().getOffset().intValue();
            int endOffset = annot.getEndNode().getOffset().intValue();
            String text = content.substring(startOffset, endOffset);
            Token token1 = new Token(type, startOffset, endOffset, "*");
            // An annotation starting at the same offset as its predecessor in
            // the sorted list shares that predecessor's position.
            // NOTE(review): the predecessor is tokens.get(i - 1) even if it was
            // filtered out above, so position can stay unchanged (or at -1)
            // after a skipped annotation - confirm this is intended.
            int inc = 1;
            if (i > 0 && annot.getStartNode().getOffset().longValue() == tokens.get(i - 1).getStartNode().getOffset().longValue()) {
                token1.setPositionIncrement(0);
                inc = 0;
            }
            position += inc;
            token1.setPosition(position);
            newTokens.add(token1);
            if (!type.equals(baseTokenAnnotationType) || (annot.getFeatures().get("string") == null)) {
                // we need to create one string feature for this
                String stringName = type + ".string";
                Token tk1 = new Token(text, startOffset, endOffset, stringName);
                indexedFeatures.add(stringName);
                tk1.setPositionIncrement(0);
                tk1.setPosition(position);
                newTokens.add(tk1);
            }
            // now find out the features and add them
            FeatureMap features = annot.getFeatures();
            for (Object key : features.keySet()) {
                String featureName = key.toString();
                // feature names are qualified with the annotation type
                String qualifiedName = type + "." + featureName;
                if (excludeFeatures && featuresToExclude.contains(qualifiedName)) {
                    continue;
                }
                if (includeFeatures && !featuresToInclude.contains(qualifiedName)) {
                    continue;
                }
                // looked up by the key's string form, so a non-String key
                // whose string form is not itself a key is skipped here
                Object tempText = features.get(featureName);
                if (tempText == null) {
                    continue;
                }
                String text1 = tempText.toString();
                // value token: lets searches match on AT.Feature == value
                Token tempToken = new Token(text1, startOffset, endOffset, qualifiedName);
                indexedFeatures.add(qualifiedName);
                tempToken.setPositionIncrement(0);
                tempToken.setPosition(position);
                newTokens.add(tempToken);
                // marker token: for each annotation type feature we add
                // AT.Feature == "**" to be able to search for it and to
                // calculate stats
                Token onlyATFeature = new Token(qualifiedName, startOffset, endOffset, "**");
                onlyATFeature.setPosition(position);
                onlyATFeature.setPositionIncrement(0);
                newTokens.add(onlyATFeature);
            }
        }
        toReturn[counter] = newTokens;
        counter++;
    }
    return toReturn;
}
Also used : ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) Token(gate.creole.annic.apache.lucene.analysis.Token) GateRuntimeException(gate.util.GateRuntimeException) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) FeatureMap(gate.FeatureMap) AnnotationSetImpl(gate.annotation.AnnotationSetImpl) OffsetComparator(gate.util.OffsetComparator)

Aggregations

InvalidOffsetException (gate.util.InvalidOffsetException)15 Annotation (gate.Annotation)6 AnnotationSet (gate.AnnotationSet)5 ArrayList (java.util.ArrayList)5 HashSet (java.util.HashSet)4 FeatureMap (gate.FeatureMap)3 AnnotationSetImpl (gate.annotation.AnnotationSetImpl)3 DocumentFormatException (gate.util.DocumentFormatException)3 GateRuntimeException (gate.util.GateRuntimeException)3 IOException (java.io.IOException)3 List (java.util.List)3 DocumentContentImpl (gate.corpora.DocumentContentImpl)2 XMLStreamException (javax.xml.stream.XMLStreamException)2 CorpusExporter (gate.CorpusExporter)1 Document (gate.Document)1 Node (gate.Node)1 Token (gate.creole.annic.apache.lucene.analysis.Token)1 Document (gate.creole.annic.apache.lucene.document.Document)1 EmailDocumentHandler (gate.email.EmailDocumentHandler)1 StatusListener (gate.event.StatusListener)1