Search in sources :

Example 1 with AnnotationSetImpl

use of gate.annotation.AnnotationSetImpl in project gate-core by GateNLP.

the class LuceneDocument method createDocuments.

/**
 * Converts a GATE document into the Lucene documents that are stored in the
 * index. Every token stream that is produced is also written to disk (see
 * {@code writeOnDisk}) so that it can be retrieved at search time.
 * <p>
 * One Lucene document is created per token stream returned by
 * {@code getTokens}, for every annotation set that is indexed.
 *
 * @param corpusPersistenceID
 *          corpus id stored with each Lucene document; when {@code null} the
 *          corpus id field is omitted
 * @param gateDoc
 *          the GATE document to convert
 * @param documentID
 *          id stored with each Lucene document; also the prefix of the id
 *          used for the serialized token stream ({@code documentID-N})
 * @param annotSetsToInclude
 *          names of the annotation sets to index; when non-empty this list
 *          is used as is and {@code annotSetsToExclude} is ignored
 * @param annotSetsToExclude
 *          names of the annotation sets that must not be indexed; only
 *          consulted when {@code annotSetsToInclude} is empty
 * @param featuresToInclude
 *          passed through to {@code getTokens}
 * @param featuresToExclude
 *          passed through to {@code getTokens}
 * @param indexLocation
 *          passed through to {@code writeOnDisk}
 * @param baseTokenAnnotationType
 *          either {@code type} or {@code setName.type}; when {@code null} or
 *          empty, or when no such annotations are found, tokens are created
 *          with {@code createTokens}
 * @param createTokensAutomatically
 *          whether tokens may be created when none are found; it is unboxed
 *          only in that situation, so it must not be {@code null} then
 * @param indexUnitAnnotationType
 *          either {@code type} or {@code setName.type}; may be {@code null}
 * @return the Lucene documents, or {@code null} when the document is ignored:
 *         tokens are missing and may not (or could not) be created,
 *         {@code getTokens} returned {@code null}, or a token stream could
 *         not be written to disk
 */
public List<Document> createDocuments(String corpusPersistenceID, gate.Document gateDoc, String documentID, List<String> annotSetsToInclude, List<String> annotSetsToExclude, List<String> featuresToInclude, List<String> featuresToExclude, String indexLocation, String baseTokenAnnotationType, Boolean createTokensAutomatically, String indexUnitAnnotationType) {
    if (baseTokenAnnotationType != null) {
        baseTokenAnnotationType = baseTokenAnnotationType.trim();
    }
    List<Document> toReturnBack = new ArrayList<Document>();
    List<String> annotSetsToIndex = resolveAnnotSetsToIndex(gateDoc, annotSetsToInclude, annotSetsToExclude);

    // ---------------------------------------------------------------
    // Locate the annotation set that holds the base tokens.
    // ---------------------------------------------------------------
    AnnotationSet baseTokenAnnotationSet = null;
    // Stays true unless the user named a set that really contains the tokens.
    boolean searchBaseTokensInAllAnnotationSets = true;
    // Has the user given the type as setName.baseTokenAnnotationType?
    int index = -1;
    if (baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0) {
        index = baseTokenAnnotationType.lastIndexOf('.');
    }
    if (index >= 0) {
        String setName = baseTokenAnnotationType.substring(0, index);
        baseTokenAnnotationType = baseTokenAnnotationType.substring(index + 1, baseTokenAnnotationType.length());
        baseTokenAnnotationSet = lookupTypeInAnnotSet(gateDoc, setName, baseTokenAnnotationType);
        if (baseTokenAnnotationSet == null || baseTokenAnnotationSet.size() == 0) {
            System.err.println("Base Tokens " + baseTokenAnnotationType + " counldn't be found under the specified annotation set " + setName + "\n searching them in other annotation sets");
        } else {
            searchBaseTokensInAllAnnotationSets = false;
        }
    }
    boolean hasBaseTokenType = baseTokenAnnotationType != null && baseTokenAnnotationType.length() > 0;
    // Without a usable token type we have to create the tokens ourselves.
    boolean createManualTokens = !hasBaseTokenType;
    if (hasBaseTokenType && searchBaseTokensInAllAnnotationSets) {
        AnnotationSet found = findTypeInAnnotSets(gateDoc, annotSetsToIndex, baseTokenAnnotationType);
        if (found != null) {
            baseTokenAnnotationSet = found;
        } else {
            // nothing found anywhere; a (possibly empty) set obtained from the
            // explicitly named annotation set above is deliberately kept
            createManualTokens = true;
        }
    }
    if (createManualTokens) {
        if (!createTokensAutomatically.booleanValue()) {
            System.out.println("Tokens couldn't be found in the document - Ignoring the document " + gateDoc.getName());
            return null;
        }
        baseTokenAnnotationType = Constants.ANNIC_TOKEN;
        if (baseTokenAnnotationSet == null) {
            baseTokenAnnotationSet = new AnnotationSetImpl(gateDoc);
        }
        if (!createTokens(gateDoc, baseTokenAnnotationSet)) {
            System.out.println("Tokens couldn't be created manually - Ignoring the document " + gateDoc.getName());
            return null;
        }
    }

    // ---------------------------------------------------------------
    // Locate the annotation set that holds the index units. From here on
    // baseTokenAnnotationSet is non-null and baseTokenAnnotationType is known.
    // ---------------------------------------------------------------
    AnnotationSet indexUnitAnnotationSet = null;
    boolean searchIndexUnitInAllAnnotationSets = true;
    // Has the user given the type as setName.indexUnitAnnotationType?
    index = -1;
    if (indexUnitAnnotationType != null && indexUnitAnnotationType.trim().length() > 0) {
        index = indexUnitAnnotationType.lastIndexOf('.');
    }
    if (index >= 0) {
        String setName = indexUnitAnnotationType.substring(0, index);
        indexUnitAnnotationType = indexUnitAnnotationType.substring(index + 1, indexUnitAnnotationType.length());
        indexUnitAnnotationSet = lookupTypeInAnnotSet(gateDoc, setName, indexUnitAnnotationType);
        if (indexUnitAnnotationSet == null || indexUnitAnnotationSet.size() == 0) {
            System.err.println("Index Unit " + indexUnitAnnotationType + " counldn't be found under the specified annotation set " + setName + "\n searching them in other annotation sets");
        } else {
            searchIndexUnitInAllAnnotationSets = false;
        }
    }
    if (indexUnitAnnotationType != null && indexUnitAnnotationType.length() > 0 && searchIndexUnitInAllAnnotationSets) {
        AnnotationSet found = findTypeInAnnotSets(gateDoc, annotSetsToIndex, indexUnitAnnotationType);
        if (found != null) {
            indexUnitAnnotationSet = found;
        }
    }
    // no index unit set at all means there is no index unit type either
    if (indexUnitAnnotationSet == null) {
        indexUnitAnnotationType = null;
    }

    // ---------------------------------------------------------------
    // Create the Lucene documents, one per token stream per annotation set.
    // ---------------------------------------------------------------
    // j numbers the serialized token streams across ALL annotation sets
    int j = 0;
    for (String annotSet : annotSetsToIndex) {
        AnnotationSet aSetToIndex = annotSet.equals(Constants.DEFAULT_ANNOTATION_SET_NAME) ? gateDoc.getAnnotations() : gateDoc.getAnnotations(annotSet);
        Set<String> indexedFeatures = new HashSet<String>();
        List<Token>[] tokenStreams = getTokens(gateDoc, aSetToIndex, featuresToInclude, featuresToExclude, baseTokenAnnotationType, baseTokenAnnotationSet, indexUnitAnnotationType, indexUnitAnnotationSet, indexedFeatures);
        if (tokenStreams == null) {
            return null;
        }
        // ';'-separated list of indexed features without a trailing separator.
        // Built once per annotation set; an empty feature set yields "" rather
        // than failing on substring(0, -1).
        StringBuilder indexedFeaturesString = new StringBuilder();
        for (String aFeat : indexedFeatures) {
            if (indexedFeaturesString.length() > 0) {
                indexedFeaturesString.append(';');
            }
            indexedFeaturesString.append(aFeat);
        }
        String indexedFeaturesValue = indexedFeaturesString.toString();
        for (int i = 0; i < tokenStreams.length; i++, j++) {
            String serializedFileID = documentID + "-" + j;
            // make a new, empty document and fill in its fields
            Document doc = new Document();
            LuceneReader reader = new LuceneReader(gateDoc, tokenStreams[i]);
            doc.add(Field.Keyword(Constants.DOCUMENT_ID, documentID));
            doc.add(Field.Keyword(Constants.DOCUMENT_ID_FOR_SERIALIZED_FILE, serializedFileID));
            doc.add(Field.Keyword(Constants.INDEXED_FEATURES, indexedFeaturesValue));
            if (corpusPersistenceID != null) {
                doc.add(Field.Keyword(Constants.CORPUS_ID, corpusPersistenceID));
            }
            doc.add(Field.Keyword(Constants.ANNOTATION_SET_ID, annotSet));
            doc.add(Field.Text("contents", reader));
            // here we store the token stream on the file system
            try {
                writeOnDisk(tokenStreams[i], documentID, serializedFileID, indexLocation);
            } catch (Exception e) {
                Err.println("\nIgnoring the document : " + gateDoc.getName() + " since its token stream cannot be written on the disk");
                Err.println("Reason: " + e.getMessage());
                return null;
            }
            toReturnBack.add(doc);
        }
    }
    return toReturnBack;
}

/**
 * Works out which annotation sets have to be indexed: the include list when
 * it is non-empty, otherwise every named annotation set of the document plus
 * the default set, minus whatever is listed in the exclude list.
 */
private List<String> resolveAnnotSetsToIndex(gate.Document gateDoc, List<String> annotSetsToInclude, List<String> annotSetsToExclude) {
    if (annotSetsToInclude.size() > 0) {
        return annotSetsToInclude;
    }
    List<String> annotSetsToIndex = new ArrayList<String>();
    Set<String> namedAnnotSets = new HashSet<String>();
    if (gateDoc.getNamedAnnotationSets() != null && gateDoc.getNamedAnnotationSets().keySet() != null) {
        namedAnnotSets = gateDoc.getNamedAnnotationSets().keySet();
    }
    for (String setName : namedAnnotSets) {
        // an empty exclude list excludes nothing
        if (!annotSetsToExclude.contains(setName)) {
            annotSetsToIndex.add(setName);
        }
    }
    if (!annotSetsToExclude.contains(Constants.DEFAULT_ANNOTATION_SET_NAME)) {
        annotSetsToIndex.add(Constants.DEFAULT_ANNOTATION_SET_NAME);
    }
    return annotSetsToIndex;
}

/**
 * Returns the annotations of the given type from the named annotation set,
 * where the default-set marker selects the document's default set.
 */
private AnnotationSet lookupTypeInAnnotSet(gate.Document gateDoc, String setName, String annotationType) {
    AnnotationSet container = setName.equals(Constants.DEFAULT_ANNOTATION_SET_NAME) ? gateDoc.getAnnotations() : gateDoc.getAnnotations(setName);
    return container.get(annotationType);
}

/**
 * Returns the annotations of the given type from the first listed annotation
 * set that has any, or {@code null} when none of the sets has such annotations.
 */
private AnnotationSet findTypeInAnnotSets(gate.Document gateDoc, List<String> annotSetNames, String annotationType) {
    for (String aSet : annotSetNames) {
        AnnotationSet candidate = lookupTypeInAnnotSet(gateDoc, aSet, annotationType);
        // a null result is treated like an empty one
        if (candidate != null && candidate.size() > 0) {
            return candidate;
        }
    }
    return null;
}
Also used : HashSet(java.util.HashSet) AnnotationSet(gate.AnnotationSet) Set(java.util.Set) ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) Document(gate.creole.annic.apache.lucene.document.Document) InvalidOffsetException(gate.util.InvalidOffsetException) GateRuntimeException(gate.util.GateRuntimeException) IOException(java.io.IOException) AnnotationSetImpl(gate.annotation.AnnotationSetImpl) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet)

Example 2 with AnnotationSetImpl

use of gate.annotation.AnnotationSetImpl in project gate-core by GateNLP.

the class DocumentImpl method getAnnotations.

// getAnnotations()
/**
 * Returns the annotation set registered under the given name, creating and
 * registering a new one (and notifying listeners that a set was added) when
 * no set with that name exists yet. A {@code null} or empty name yields the
 * default annotation set.
 */
@Override
public AnnotationSet getAnnotations(String name) {
    // null and "" both denote the default annotation set
    if (name == null || name.isEmpty()) {
        return getAnnotations();
    }
    // the map of named sets is created lazily
    if (namedAnnotSets == null) {
        namedAnnotSets = new HashMap<String, AnnotationSet>();
    }
    AnnotationSet existing = namedAnnotSets.get(name);
    if (existing != null) {
        return existing;
    }
    // first request for this name: create, register, then announce the set
    AnnotationSet created = new AnnotationSetImpl(this, name);
    namedAnnotSets.put(name, created);
    fireAnnotationSetAdded(new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_ADDED, name));
    return created;
}
Also used : AnnotationSetImpl(gate.annotation.AnnotationSetImpl) AnnotationSet(gate.AnnotationSet) DocumentEvent(gate.event.DocumentEvent)

Example 3 with AnnotationSetImpl

use of gate.annotation.AnnotationSetImpl in project gate-core by GateNLP.

the class DocumentImpl method getAnnotations.

/**
 * Returns the default annotation set. On first use the set is created under
 * the empty name and listeners are notified that a set was added.
 */
@Override
public AnnotationSet getAnnotations() {
    if (defaultAnnots != null) {
        return defaultAnnots;
    }
    // lazily create the default set, then announce it
    defaultAnnots = new AnnotationSetImpl(this, "");
    DocumentEvent added = new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_ADDED, "");
    fireAnnotationSetAdded(added);
    return defaultAnnots;
}
Also used : AnnotationSetImpl(gate.annotation.AnnotationSetImpl) DocumentEvent(gate.event.DocumentEvent)

Example 4 with AnnotationSetImpl

use of gate.annotation.AnnotationSetImpl in project gate-core by GateNLP.

the class DocumentImpl method saveAnnotationSetAsXmlInOrig.

// hasOriginalContentFeatures
/**
 * Serializes the annotations from {@code aSourceAnnotationSet} as XML tags
 * and inserts them into the original document content, which is read from
 * the document feature
 * {@code GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME} (an empty
 * string is used when that feature is absent).
 * <p>
 * Side effect: annotations are removed from {@code aSourceAnnotationSet}
 * while they are being written, so the caller's set is modified.
 *
 * @param aSourceAnnotationSet
 *          the annotations to write into the original content. If it is
 *          <b>null</b> the original content is returned unchanged.
 * @param includeFeatures
 *          is a boolean, which is passed to {@code writeStartTag} and
 *          controls whether the annotation features and gate ID are included
 *          or not.
 * @return The XML document obtained from the original content + the
 *         information from the source annotation set.
 */
private String saveAnnotationSetAsXmlInOrig(Set<Annotation> aSourceAnnotationSet, boolean includeFeatures) {
    StringBuffer docContStrBuff;
    String origContent;
    origContent = (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
    if (origContent == null) {
        origContent = "";
    }
    // if
    long originalContentSize = origContent.length();
    // NOTE(review): repositioning is dereferenced below without a null check;
    // presumably callers make sure the repositioning feature exists - confirm.
    RepositioningInfo repositioning = (RepositioningInfo) getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
    docContStrBuff = new StringBuffer(origContent);
    // Nothing to dump: return the original content as it is.
    if (aSourceAnnotationSet == null)
        return docContStrBuff.toString();
    StatusListener sListener = (StatusListener) gate.Gate.getListeners().get("gate.event.StatusListener");
    AnnotationSet originalMarkupsAnnotSet = this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    // Create a dumping annotation set on the document. It will be used for
    // dumping annotations...
    AnnotationSet dumpingSet = new AnnotationSetImpl(this);
    if (sListener != null)
        sListener.statusChanged("Constructing the dumping annotation set.");
    // Then take all the annotations from aSourceAnnotationSet and verify if
    // they can be inserted safely into the dumpingSet. Where not possible,
    // report.
    // NOTE(review): dumpingSet is only used for this safety check; the
    // serialization below iterates aSourceAnnotationSet, so annotations that
    // are reported here as discarded still appear to be written - confirm
    // whether that is intended.
    Iterator<Annotation> iter = aSourceAnnotationSet.iterator();
    Annotation currentAnnot;
    while (iter.hasNext()) {
        currentAnnot = iter.next();
        if (insertsSafety(originalMarkupsAnnotSet, currentAnnot) && insertsSafety(dumpingSet, currentAnnot)) {
            dumpingSet.add(currentAnnot);
        } else {
            Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() + ", startOffset=" + currentAnnot.getStartNode().getOffset() + ", endOffset=" + currentAnnot.getEndNode().getOffset() + ", type=" + currentAnnot.getType() + " was found to violate the" + " crossed over condition. It will be discarded");
        }
    // End if
    }
    // Here we go.
    if (sListener != null)
        sListener.statusChanged("Dumping annotations as XML");
    // /////////////////////////////////////////
    // Collect every start and end offset of the source annotations in a
    // sorted set. The offsets are then processed from the highest to the
    // lowest, and for each offset all the tags that belong there are written
    // from left to right.
    // Construct the node set
    TreeSet<Long> offsets = new TreeSet<Long>();
    iter = aSourceAnnotationSet.iterator();
    while (iter.hasNext()) {
        Annotation annot = iter.next();
        offsets.add(annot.getStartNode().getOffset());
        offsets.add(annot.getEndNode().getOffset());
    }
    // iteration
    while (!offsets.isEmpty()) {
        // Take the highest remaining offset; presumably so that inserting the
        // tags does not shift the positions that are still to be processed.
        Long offset = offsets.last();
        // Remove the offset from the set
        offsets.remove(offset);
        // Now, use it.
        // Returns a list with annotations that needs to be serialized in that
        // offset.
        List<Annotation> annotations = getAnnotationsForOffset(aSourceAnnotationSet, offset);
        // Attention: the annotation are serialized from left to right
        // tmpBuff collects all the tags to be inserted at this offset
        StringBuffer tmpBuff = new StringBuffer("");
        // stack holds zero-length "isEmptyAndSpan" annotations whose start tag
        // has been written and whose end tag is still pending
        Stack<Annotation> stack = new Stack<Annotation>();
        // Iterate through all these annotations and serialize them
        Iterator<Annotation> it = annotations.iterator();
        Annotation a = null;
        while (it.hasNext()) {
            a = it.next();
            it.remove();
            // Test if a Ends at offset
            if (offset.equals(a.getEndNode().getOffset())) {
                // Test if a Starts at offset
                if (offset.equals(a.getStartNode().getOffset())) {
                    // Here, the annotation a Starts and Ends at the offset
                    if (null != a.getFeatures().get("isEmptyAndSpan") && "true".equals(a.getFeatures().get("isEmptyAndSpan"))) {
                        // Assert: annotation a with start == end and isEmptyAndSpan
                        tmpBuff.append(writeStartTag(a, includeFeatures, false));
                        stack.push(a);
                    } else {
                        // Assert annotation a with start == end and an empty tag
                        tmpBuff.append(writeEmptyTag(a, false));
                        // The annotation is removed from dumped set
                        aSourceAnnotationSet.remove(a);
                    }
                // End if
                } else {
                    // a only ends at the offset:
                    // In this case empty the stack and write the end tag
                    while (!stack.isEmpty()) {
                        Annotation a1 = stack.pop();
                        tmpBuff.append(writeEndTag(a1));
                    }
                    // End while
                    tmpBuff.append(writeEndTag(a));
                }
            // End if
            } else {
                // a does not end at the offset; test whether it starts
                // at the offset
                if (offset.equals(a.getStartNode().getOffset())) {
                    // In this case empty the stack and write the end tag
                    while (!stack.isEmpty()) {
                        Annotation a1 = stack.pop();
                        tmpBuff.append(writeEndTag(a1));
                    }
                    // End while
                    tmpBuff.append(writeStartTag(a, includeFeatures, false));
                    // The annotation is removed from dumped set
                    aSourceAnnotationSet.remove(a);
                }
            // End if ( offset.equals(a.getStartNode().getOffset()) )
            }
        // End if ( offset.equals(a.getEndNode().getOffset()) )
        }
        // Close whatever is still pending on the stack:
        // In this case empty the stack and write the end tag
        while (!stack.isEmpty()) {
            Annotation a1 = stack.pop();
            tmpBuff.append(writeEndTag(a1));
        }
        // End while
        // Map the offset back to a position in the original content.
        long originalPosition = -1;
        // a is the LAST annotation handled at this offset (or null when there
        // was none); back positioning is used when that annotation ends here
        boolean backPositioning = a != null && offset.equals(a.getEndNode().getOffset());
        if (backPositioning) {
            // end of the annotation correction
            originalPosition = repositioning.getOriginalPos(offset.intValue(), true);
        }
        // if
        // fall back to the plain lookup when back positioning was not used or
        // returned -1
        if (originalPosition == -1) {
            originalPosition = repositioning.getOriginalPos(offset.intValue());
        }
        // Insert tmpBuff to the location where it belongs in docContStrBuff
        if (originalPosition != -1 && originalPosition <= originalContentSize) {
            docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
        } else {
            Out.prln("Error in the repositioning. The offset (" + offset.intValue() + ") could not be positioned in the original document. \n" + "Calculated position is: " + originalPosition + " placed back: " + backPositioning);
        }
    // if
    }
    // End while(!offsets.isEmpty())
    // When a root annotation is set, its end tag is appended at the very end
    if (theRootAnnotation != null)
        docContStrBuff.append(writeEndTag(theRootAnnotation));
    return docContStrBuff.toString();
}
Also used : AnnotationSet(gate.AnnotationSet) Annotation(gate.Annotation) Stack(java.util.Stack) AnnotationSetImpl(gate.annotation.AnnotationSetImpl) TreeSet(java.util.TreeSet) StatusListener(gate.event.StatusListener)

Example 5 with AnnotationSetImpl

use of gate.annotation.AnnotationSetImpl in project gate-core by GateNLP.

the class LuceneDocument method getTokens.

/**
 * Creates, for every index unit of a GATE document, the list of Lucene
 * tokens that represents the annotations contained in that unit.
 * <p>
 * The index units are the annotations in {@code indexUnitSet}; when no index
 * unit type or no index unit annotations are available the whole document is
 * treated as a single unit. For each annotation inside a unit the following
 * tokens are produced, all at the same position:
 * <ul>
 * <li>a token of type {@code "*"} whose text is the annotation type,</li>
 * <li>a {@code type.string} token with the covered document text, unless the
 * annotation is a base token that already has a {@code string} feature,</li>
 * <li>for every non-null feature a {@code type.feature} token with the
 * feature value and a {@code "**"} token whose text is
 * {@code type.feature}.</li>
 * </ul>
 *
 * @param document
 *          the GATE document being indexed
 * @param inputAs
 *          the annotation set whose annotations are indexed
 * @param featuresToInclude
 *          when non-empty, only annotation types and {@code type.feature}
 *          names listed here are indexed and {@code featuresToExclude} is
 *          ignored
 * @param featuresToExclude
 *          annotation types and {@code type.feature} names that are skipped;
 *          only used when {@code featuresToInclude} is empty
 * @param baseTokenAnnotationType
 *          the annotation type of the base tokens
 * @param baseTokenSet
 *          the base token annotations; may be {@code null} or empty
 * @param indexUnitAnnotationType
 *          the annotation type of the index units; may be {@code null}
 * @param indexUnitSet
 *          the index unit annotations; may be {@code null} or empty
 * @param indexedFeatures
 *          output parameter that receives the name of every
 *          {@code type.feature} that was indexed
 * @return one token list per index unit, or {@code null} as soon as a unit
 *         is found that contains no annotations at all
 */
private List<Token>[] getTokens(gate.Document document, AnnotationSet inputAs, List<String> featuresToInclude, List<String> featuresToExclude, String baseTokenAnnotationType, AnnotationSet baseTokenSet, String indexUnitAnnotationType, AnnotationSet indexUnitSet, Set<String> indexedFeatures) {
    // The include list wins: the exclude list is only used when nothing is
    // explicitly included.
    boolean includeFeatures = !featuresToInclude.isEmpty();
    boolean excludeFeatures = !includeFeatures && !featuresToExclude.isEmpty();

    // Work out the offset range of every index unit.
    HashSet<OffsetGroup> unitOffsetsSet = new HashSet<OffsetGroup>();
    if (indexUnitAnnotationType == null || indexUnitAnnotationType.trim().length() == 0 || indexUnitSet == null || indexUnitSet.size() == 0) {
        // the index unit annotation type is not specified (or there are no
        // such annotations), therefore the entire document is a single unit
        OffsetGroup group = new OffsetGroup();
        group.startOffset = 0L;
        group.endOffset = document.getContent().size();
        unitOffsetsSet.add(group);
    } else {
        for (Annotation annotation : indexUnitSet) {
            OffsetGroup group = new OffsetGroup();
            group.startOffset = annotation.getStartNode().getOffset();
            group.endOffset = annotation.getEndNode().getOffset();
            unitOffsetsSet.add(group);
        }
    }

    // Collect the annotation types that can be indexed. Type names that
    // contain '.', '=', ';' or ',' are rejected.
    Set<String> allTypes = new HashSet<String>();
    for (String aType : inputAs.getAllTypes()) {
        if (aType.indexOf(".") > -1 || aType.indexOf("=") > -1 || aType.indexOf(";") > -1 || aType.indexOf(",") > -1) {
            System.err.println("Annotations of type " + aType + " cannot be indexed as the type name contains one of the ., =, or ; character");
            continue;
        }
        allTypes.add(aType);
    }
    // base tokens and index units are handled separately
    if (baseTokenSet != null && baseTokenSet.size() > 0) {
        allTypes.remove(baseTokenAnnotationType);
    }
    if (indexUnitSet != null && indexUnitSet.size() > 0) {
        allTypes.remove(indexUnitAnnotationType);
    }

    // Copy the remaining annotations into a fresh annotation set.
    AnnotationSet toUseSet = new AnnotationSetImpl(document);
    for (String type : allTypes) {
        for (Annotation a : inputAs.get(type)) {
            try {
                toUseSet.add(a.getStartNode().getOffset(), a.getEndNode().getOffset(), a.getType(), a.getFeatures());
            } catch (InvalidOffsetException ioe) {
                throw new GateRuntimeException(ioe);
            }
        }
    }

    // The document text does not change while the tokens are built, so it is
    // materialized once here instead of once per annotation.
    String documentText = document.getContent().toString();

    @SuppressWarnings({ "cast", "unchecked", "rawtypes" }) List<Token>[] toReturn = (List<Token>[]) new List[unitOffsetsSet.size()];
    int counter = 0;
    for (OffsetGroup group : unitOffsetsSet) {
        List<Token> newTokens = new ArrayList<Token>();
        List<Annotation> tokens = new ArrayList<Annotation>(toUseSet.getContained(group.startOffset, group.endOffset));
        // add tokens from the baseTokenSet
        if (baseTokenSet != null && baseTokenSet.size() != 0) {
            tokens.addAll(baseTokenSet.getContained(group.startOffset, group.endOffset));
        }
        // a unit without any annotation makes the whole document unusable
        if (tokens.isEmpty()) {
            return null;
        }
        Collections.sort(tokens, new OffsetComparator());
        int position = -1;
        for (int i = 0; i < tokens.size(); i++) {
            byte inc = 1;
            Annotation annot = tokens.get(i);
            String type = annot.getType();
            // the annotation type is explicitly excluded
            if (excludeFeatures && featuresToExclude.contains(type)) {
                continue;
            }
            // an include list exists and the annotation type is not in it
            if (includeFeatures && !featuresToInclude.contains(type)) {
                continue;
            }
            int startOffset = annot.getStartNode().getOffset().intValue();
            int endOffset = annot.getEndNode().getOffset().intValue();
            String text = documentText.substring(startOffset, endOffset);
            Token token1 = new Token(type, startOffset, endOffset, "*");
            // an annotation that starts where the previous one in the sorted
            // list started shares its position
            if (i > 0) {
                if (annot.getStartNode().getOffset().longValue() == tokens.get(i - 1).getStartNode().getOffset().longValue()) {
                    token1.setPositionIncrement(0);
                    inc = 0;
                }
            }
            position += inc;
            token1.setPosition(position);
            newTokens.add(token1);
            if (!type.equals(baseTokenAnnotationType) || (annot.getFeatures().get("string") == null)) {
                // there is no usable string feature, so one is created from
                // the covered document text
                Token tk1 = new Token(text, startOffset, endOffset, type + ".string");
                indexedFeatures.add(type + ".string");
                tk1.setPositionIncrement(0);
                tk1.setPosition(position);
                newTokens.add(tk1);
            }
            // now find out the features and add them
            FeatureMap features = annot.getFeatures();
            Iterator<Object> fIter = features.keySet().iterator();
            while (fIter.hasNext()) {
                String type1 = fIter.next().toString();
                // feature names are qualified with the annotation type
                String qualifiedFeature = type + "." + type1;
                // the feature is explicitly excluded
                if (excludeFeatures && featuresToExclude.contains(qualifiedFeature)) {
                    continue;
                }
                // an include list exists and the feature is not in it
                if (includeFeatures && !featuresToInclude.contains(qualifiedFeature)) {
                    continue;
                }
                Object tempText = features.get(type1);
                if (tempText == null) {
                    continue;
                }
                String text1 = tempText.toString();
                // the value token makes AT.Feature searchable by value
                Token tempToken = new Token(text1, startOffset, endOffset, qualifiedFeature);
                indexedFeatures.add(qualifiedFeature);
                tempToken.setPositionIncrement(0);
                tempToken.setPosition(position);
                newTokens.add(tempToken);
                // for each annotation type feature we also add AT.Feature=="**"
                // to be able to search for it and to calculate stats
                Token onlyATFeature = new Token(qualifiedFeature, startOffset, endOffset, "**");
                onlyATFeature.setPosition(position);
                onlyATFeature.setPositionIncrement(0);
                newTokens.add(onlyATFeature);
            }
        }
        toReturn[counter] = newTokens;
        counter++;
    }
    return toReturn;
}
Also used : ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) Token(gate.creole.annic.apache.lucene.analysis.Token) GateRuntimeException(gate.util.GateRuntimeException) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) FeatureMap(gate.FeatureMap) AnnotationSetImpl(gate.annotation.AnnotationSetImpl) OffsetComparator(gate.util.OffsetComparator)

Aggregations

AnnotationSetImpl (gate.annotation.AnnotationSetImpl)6 AnnotationSet (gate.AnnotationSet)5 InvalidOffsetException (gate.util.InvalidOffsetException)3 HashSet (java.util.HashSet)3 Annotation (gate.Annotation)2 DocumentEvent (gate.event.DocumentEvent)2 GateRuntimeException (gate.util.GateRuntimeException)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 FeatureMap (gate.FeatureMap)1 Token (gate.creole.annic.apache.lucene.analysis.Token)1 Document (gate.creole.annic.apache.lucene.document.Document)1 StatusListener (gate.event.StatusListener)1 OffsetComparator (gate.util.OffsetComparator)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Set (java.util.Set)1 Stack (java.util.Stack)1 TreeSet (java.util.TreeSet)1