Search in sources :

Example 1 with Token

use of gate.creole.annic.apache.lucene.analysis.Token in project gate-core by GateNLP.

the class Posting method invertDocument.

/**
 * Tokenizes the fields of a document into Postings.
 *
 * <p>Every field returned by {@code doc.fields()} is visited. Fields that are
 * not indexed are skipped. For an indexed field, postings are registered via
 * {@code addPosition} and the per-field running length, position and boost
 * (kept in {@code fieldLengths}, {@code fieldPositions} and
 * {@code fieldBoosts}) are updated.
 *
 * @param doc the document whose fields are inverted
 * @throws IOException if reading from or closing the token stream fails
 * @throws IllegalArgumentException if a tokenized field has neither a Reader
 *         nor a String value
 */
private final void invertDocument(Document doc) throws IOException {
    for (Enumeration allFields = doc.fields(); allFields.hasMoreElements();) {
        Field current = (Field) allFields.nextElement();
        String name = current.name();
        int number = fieldInfos.fieldNumber(name);
        // running totals carried over from earlier fields with the same name
        int tokenCount = fieldLengths[number];
        int nextPosition = fieldPositions[number];

        if (!current.isIndexed()) {
            continue;
        }

        if (current.isTokenized()) {
            // prefer the Reader value, fall back to wrapping the String value
            Reader source = current.readerValue();
            if (source == null) {
                String text = current.stringValue();
                if (text == null) {
                    throw new IllegalArgumentException("field must have either String or Reader value");
                }
                source = new StringReader(text);
            }

            // run the analyzer over the field and record one posting per token
            TokenStream tokens = analyzer.tokenStream(name, source);
            try {
                Token tok;
                while ((tok = tokens.next()) != null) {
                    // honour gaps (increment > 1) and stacked tokens (increment 0)
                    nextPosition += tok.getPositionIncrement() - 1;
                    // tokens without a type are filed under "*"
                    String tokenType = (tok.type() == null) ? "*" : tok.type();
                    addPosition(name, tok.termText(), tokenType, nextPosition++);
                    // stop once the running length exceeds maxFieldLength
                    if (++tokenCount > maxFieldLength) {
                        break;
                    }
                }
            } finally {
                tokens.close();
            }
        } else {
            // un-tokenized field: the whole string value becomes a single posting
            addPosition(name, current.stringValue(), "Field", nextPosition++);
            tokenCount++;
        }

        // write the running totals back for the next field of the same name
        fieldLengths[number] = tokenCount;
        fieldPositions[number] = nextPosition;
        fieldBoosts[number] *= current.getBoost();
    }
}
Also used : Field(gate.creole.annic.apache.lucene.document.Field) TokenStream(gate.creole.annic.apache.lucene.analysis.TokenStream) Enumeration(java.util.Enumeration) StringReader(java.io.StringReader) Reader(java.io.Reader) StringReader(java.io.StringReader) Token(gate.creole.annic.apache.lucene.analysis.Token)

Example 2 with Token

use of gate.creole.annic.apache.lucene.analysis.Token in project gate-core by GateNLP.

the class LuceneDocument method getTokens.

/**
 * Given a GATE document and the other required parameters, creates for each
 * annotation of type indexUnitAnnotationType a separate list of the tokens
 * lying within it.
 *
 * <p>If no index unit type/set is supplied (null, blank or empty), the whole
 * document is treated as a single unit. For every unit, the annotations of
 * {@code inputAs} (minus the base token and index unit types, and minus types
 * whose name contains one of {@code . = ; ,}) plus the contained base tokens
 * are sorted with {@code OffsetComparator} and turned into tokens:
 * <ul>
 * <li>one token for the annotation type itself (token type {@code "*"});</li>
 * <li>a {@code Type.string} token holding the covered text, unless the
 * annotation is a base token that already has a {@code string} feature;</li>
 * <li>for each non-null feature, a {@code Type.feature} token holding the
 * value and a {@code "**"} token holding the qualified feature name.</li>
 * </ul>
 * Annotations that start at the same offset as the previously emitted
 * annotation share its position (position increment 0).
 *
 * @param document the GATE document being indexed
 * @param inputAs annotation set supplying the annotations to index
 * @param featuresToInclude if non-empty, only the listed types /
 *        {@code Type.feature} names are indexed (takes precedence over
 *        {@code featuresToExclude})
 * @param featuresToExclude types / {@code Type.feature} names to skip; only
 *        consulted when {@code featuresToInclude} is empty
 * @param baseTokenAnnotationType annotation type of the base tokens
 * @param baseTokenSet annotation set holding the base tokens; may be null
 * @param indexUnitAnnotationType annotation type delimiting index units; may
 *        be null or blank
 * @param indexUnitSet annotation set holding the index units; may be null
 * @param indexedFeatures output parameter: receives every
 *        {@code Type.feature} name for which a token was created
 * @return one token list per index unit, or {@code null} as soon as a unit
 *         is found that contains no annotations at all
 * @throws GateRuntimeException if an annotation cannot be copied because of
 *         invalid offsets
 */
private List<Token>[] getTokens(gate.Document document, AnnotationSet inputAs, List<String> featuresToInclude, List<String> featuresToExclude, String baseTokenAnnotationType, AnnotationSet baseTokenSet, String indexUnitAnnotationType, AnnotationSet indexUnitSet, Set<String> indexedFeatures) {
    // an include list wins; the exclude list only applies when nothing is
    // explicitly included
    boolean includeFeatures = !featuresToInclude.isEmpty();
    boolean excludeFeatures = !includeFeatures && !featuresToExclude.isEmpty();

    HashSet<OffsetGroup> unitOffsetsSet = new HashSet<OffsetGroup>();
    if (indexUnitAnnotationType == null || indexUnitAnnotationType.trim().length() == 0 || indexUnitSet == null || indexUnitSet.size() == 0) {
        // the index Unit Annotation Type is not specified
        // therefore we consider the entire document as a single unit
        OffsetGroup group = new OffsetGroup();
        group.startOffset = 0L;
        group.endOffset = document.getContent().size();
        unitOffsetsSet.add(group);
    } else {
        Iterator<Annotation> unitIter = indexUnitSet.iterator();
        while (unitIter.hasNext()) {
            Annotation annotation = unitIter.next();
            OffsetGroup group = new OffsetGroup();
            group.startOffset = annotation.getStartNode().getOffset();
            group.endOffset = annotation.getEndNode().getOffset();
            unitOffsetsSet.add(group);
        }
    }

    // collect the annotation types that can be indexed
    Set<String> allTypes = new HashSet<String>();
    for (String aType : inputAs.getAllTypes()) {
        if (aType.indexOf(".") > -1 || aType.indexOf("=") > -1 || aType.indexOf(";") > -1 || aType.indexOf(",") > -1) {
            // the message now lists every character the check above rejects
            System.err.println("Annotations of type " + aType + " cannot be indexed as the type name contains one of the ., =, ; or , characters");
            continue;
        }
        allTypes.add(aType);
    }
    // base tokens and index units are handled separately below
    if (baseTokenSet != null && baseTokenSet.size() > 0) {
        allTypes.remove(baseTokenAnnotationType);
    }
    if (indexUnitSet != null && indexUnitSet.size() > 0) {
        allTypes.remove(indexUnitAnnotationType);
    }

    // copy the remaining annotations into a fresh set over the same document
    AnnotationSet toUseSet = new AnnotationSetImpl(document);
    for (String type : allTypes) {
        for (Annotation a : inputAs.get(type)) {
            try {
                toUseSet.add(a.getStartNode().getOffset(), a.getEndNode().getOffset(), a.getType(), a.getFeatures());
            } catch (InvalidOffsetException ioe) {
                throw new GateRuntimeException(ioe);
            }
        }
    }

    @SuppressWarnings({ "cast", "unchecked", "rawtypes" }) List<Token>[] toReturn = (List<Token>[]) new List[unitOffsetsSet.size()];
    // document text, materialised lazily and only once: the content does not
    // change while tokens are being built
    String documentText = null;
    Iterator<OffsetGroup> groupIter = unitOffsetsSet.iterator();
    int counter = 0;
    while (groupIter.hasNext()) {
        OffsetGroup group = groupIter.next();
        List<Token> newTokens = new ArrayList<Token>();
        List<Annotation> tokens = new ArrayList<Annotation>(toUseSet.getContained(group.startOffset, group.endOffset));
        // add tokens from the baseTokenSet
        if (baseTokenSet != null && baseTokenSet.size() != 0) {
            tokens.addAll(baseTokenSet.getContained(group.startOffset, group.endOffset));
        }
        // existing contract: a unit without any annotation aborts the whole
        // call with null
        if (tokens.isEmpty()) {
            return null;
        }
        Collections.sort(tokens, new OffsetComparator());

        int position = -1;
        // start offset of the most recent annotation that was actually
        // emitted; filtered-out annotations must not influence positions
        boolean emittedAny = false;
        long lastEmittedStart = 0L;
        for (int i = 0; i < tokens.size(); i++) {
            Annotation annot = tokens.get(i);
            String type = annot.getType();
            // if the type is specified in featuresToExclude - exclude it
            if (excludeFeatures && featuresToExclude.contains(type)) {
                continue;
            }
            // if an include list is given and the type is not on it - exclude it
            if (includeFeatures && !featuresToInclude.contains(type)) {
                continue;
            }
            long annotStart = annot.getStartNode().getOffset().longValue();
            int startOffset = annot.getStartNode().getOffset().intValue();
            int endOffset = annot.getEndNode().getOffset().intValue();
            if (documentText == null) {
                documentText = document.getContent().toString();
            }
            String text = documentText.substring(startOffset, endOffset);

            Token token1 = new Token(type, startOffset, endOffset, "*");
            // annotations starting where the previously emitted one started
            // are stacked on the same position
            byte inc = 1;
            if (emittedAny && annotStart == lastEmittedStart) {
                token1.setPositionIncrement(0);
                inc = 0;
            }
            position += inc;
            token1.setPosition(position);
            newTokens.add(token1);
            emittedAny = true;
            lastEmittedStart = annotStart;

            if (!type.equals(baseTokenAnnotationType) || (annot.getFeatures().get("string") == null)) {
                // we need to create one string feature for this
                String stringFeatureName = type + ".string";
                Token tk1 = new Token(text, startOffset, endOffset, stringFeatureName);
                indexedFeatures.add(stringFeatureName);
                tk1.setPositionIncrement(0);
                tk1.setPosition(position);
                newTokens.add(tk1);
            }

            // now find out the features and add them
            FeatureMap features = annot.getFeatures();
            Iterator<Object> fIter = features.keySet().iterator();
            while (fIter.hasNext()) {
                String type1 = fIter.next().toString();
                // feature names are qualified with the annotation type
                String qualifiedName = type + "." + type1;
                // if the feature is specified in featuresToExclude - exclude it
                if (excludeFeatures && featuresToExclude.contains(qualifiedName)) {
                    continue;
                }
                // if an include list is given and the feature is not on it -
                // exclude it
                if (includeFeatures && !featuresToInclude.contains(qualifiedName)) {
                    continue;
                }
                Object tempText = features.get(type1);
                if (tempText == null) {
                    continue;
                }
                String text1 = tempText.toString();
                // token carrying the feature value, typed with the qualified name
                Token tempToken = new Token(text1, startOffset, endOffset, qualifiedName);
                indexedFeatures.add(qualifiedName);
                tempToken.setPositionIncrement(0);
                tempToken.setPosition(position);
                newTokens.add(tempToken);
                // for each annotation type feature we add AT.Feature=="**" to be
                // able to search for it and to calculate stats
                Token onlyATFeature = new Token(qualifiedName, startOffset, endOffset, "**");
                onlyATFeature.setPosition(position);
                onlyATFeature.setPositionIncrement(0);
                newTokens.add(onlyATFeature);
            }
        }
        toReturn[counter] = newTokens;
        counter++;
    }
    return toReturn;
}
Also used : ArrayList(java.util.ArrayList) AnnotationSet(gate.AnnotationSet) Token(gate.creole.annic.apache.lucene.analysis.Token) GateRuntimeException(gate.util.GateRuntimeException) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) InvalidOffsetException(gate.util.InvalidOffsetException) Annotation(gate.Annotation) FeatureMap(gate.FeatureMap) AnnotationSetImpl(gate.annotation.AnnotationSetImpl) OffsetComparator(gate.util.OffsetComparator)

Aggregations

Token (gate.creole.annic.apache.lucene.analysis.Token)2 Annotation (gate.Annotation)1 AnnotationSet (gate.AnnotationSet)1 FeatureMap (gate.FeatureMap)1 AnnotationSetImpl (gate.annotation.AnnotationSetImpl)1 TokenStream (gate.creole.annic.apache.lucene.analysis.TokenStream)1 Field (gate.creole.annic.apache.lucene.document.Field)1 GateRuntimeException (gate.util.GateRuntimeException)1 InvalidOffsetException (gate.util.InvalidOffsetException)1 OffsetComparator (gate.util.OffsetComparator)1 Reader (java.io.Reader)1 StringReader (java.io.StringReader)1 ArrayList (java.util.ArrayList)1 Enumeration (java.util.Enumeration)1 HashSet (java.util.HashSet)1 List (java.util.List)1