Search in sources :

Example 1 with TokenStream

Use of gate.creole.annic.apache.lucene.analysis.TokenStream in project gate-core by GateNLP.

The class Posting, method invertDocument.

// Inverts a document: walks each of its fields and records the terms of
// every indexed field as Postings through addPosition.
//
// Un-tokenized fields contribute their whole string value as one term of
// type "Field". Tokenized fields are passed through the analyzer; a token
// whose type is null is recorded with type "*". After a field is processed,
// its length, position and boost are written back to fieldLengths,
// fieldPositions and fieldBoosts under the field's number.
//
// Throws IllegalArgumentException when a tokenized field has neither a
// Reader nor a String value.
private final void invertDocument(Document doc) throws IOException {
    for (Enumeration remaining = doc.fields(); remaining.hasMoreElements(); ) {
        Field current = (Field) remaining.nextElement();
        String name = current.name();
        int slot = fieldInfos.fieldNumber(name);
        // Start from the counters currently stored for this field number;
        // they are written back at the bottom of the loop.
        int termCount = fieldLengths[slot];
        int cursor = fieldPositions[slot];

        if (!current.isIndexed()) {
            // Nothing is recorded or saved for fields that are not indexed.
            continue;
        }

        if (current.isTokenized()) {
            // Prefer the field's Reader; otherwise wrap its String value.
            Reader input = current.readerValue();
            if (input == null) {
                String text = current.stringValue();
                if (text == null) {
                    throw new IllegalArgumentException("field must have either String or Reader value");
                }
                input = new StringReader(text);
            }

            // Run the analyzer over the field and record each token.
            TokenStream tokens = analyzer.tokenStream(name, input);
            try {
                Token tok;
                while ((tok = tokens.next()) != null) {
                    // Move forward by (increment - 1) first; the post-increment
                    // in the call below supplies the remaining step.
                    cursor += tok.getPositionIncrement() - 1;
                    String kind = tok.type();
                    addPosition(name, tok.termText(), kind == null ? "*" : kind, cursor++);
                    // Stop once the running length exceeds the configured maximum.
                    if (++termCount > maxFieldLength) {
                        break;
                    }
                }
            } finally {
                tokens.close();
            }
        } else {
            // The whole string value becomes a single term.
            addPosition(name, current.stringValue(), "Field", cursor++);
            termCount++;
        }

        // Persist the updated counters and fold in this field's boost.
        fieldLengths[slot] = termCount;
        fieldPositions[slot] = cursor;
        fieldBoosts[slot] *= current.getBoost();
    }
}
Also used : Field(gate.creole.annic.apache.lucene.document.Field) TokenStream(gate.creole.annic.apache.lucene.analysis.TokenStream) Enumeration(java.util.Enumeration) StringReader(java.io.StringReader) Reader(java.io.Reader) StringReader(java.io.StringReader) Token(gate.creole.annic.apache.lucene.analysis.Token)

Aggregations

Token (gate.creole.annic.apache.lucene.analysis.Token)1 TokenStream (gate.creole.annic.apache.lucene.analysis.TokenStream)1 Field (gate.creole.annic.apache.lucene.document.Field)1 Reader (java.io.Reader)1 StringReader (java.io.StringReader)1 Enumeration (java.util.Enumeration)1