Use of gate.creole.annic.apache.lucene.analysis.TokenStream in project gate-core by GateNLP.
Method invertDocument of the class Posting.
/**
 * Tokenizes the fields of a document into Postings.
 *
 * <p>Every field of {@code doc} is visited in enumeration order. For an indexed
 * field, either its string value is recorded as one posting of type
 * {@code "Field"} (un-tokenized field), or its content is run through the
 * analyzer and each resulting token is recorded with its own type, falling back
 * to {@code "*"} when the token has no type. The running length, position and
 * boost stored per field number are updated afterwards.
 *
 * @param doc the document whose fields are inverted
 * @throws IOException propagated from reading or closing the token stream
 * @throws IllegalArgumentException if a tokenized field has neither a Reader
 *         nor a String value
 */
private final void invertDocument(Document doc) throws IOException {
    Enumeration fieldEnum = doc.fields();
    while (fieldEnum.hasMoreElements()) {
        Field field = (Field) fieldEnum.nextElement();
        String fieldName = field.name();
        int fieldNumber = fieldInfos.fieldNumber(fieldName);

        // Resume from the counters saved for this field number; they carry over
        // between successive fields that map to the same number.
        int length = fieldLengths[fieldNumber];
        int position = fieldPositions[fieldNumber];

        // Fields that are not indexed contribute nothing and leave the counters alone.
        if (!field.isIndexed()) {
            continue;
        }

        if (field.isTokenized()) {
            // Prefer the field's Reader; otherwise wrap its String value.
            Reader reader = field.readerValue();
            if (reader == null) {
                String text = field.stringValue();
                if (text == null) {
                    throw new IllegalArgumentException("field must have either String or Reader value");
                }
                reader = new StringReader(text);
            }

            // Tokenize the field and add each token to the posting table.
            TokenStream stream = analyzer.tokenStream(fieldName, reader);
            try {
                Token token;
                while ((token = stream.next()) != null) {
                    // Skip ahead by the token's increment; the post-increment in the
                    // addPosition call below accounts for the remaining step of one.
                    position += token.getPositionIncrement() - 1;

                    String tokenType = token.type();
                    if (tokenType == null) {
                        tokenType = "*";
                    }
                    addPosition(fieldName, token.termText(), tokenType, position++);

                    // The limit is tested after the token has been recorded, so the
                    // token that pushes length past maxFieldLength is still indexed.
                    if (++length > maxFieldLength) {
                        break;
                    }
                }
            } finally {
                stream.close();
            }
        } else {
            // Un-tokenized field: the whole string value becomes a single posting.
            addPosition(fieldName, field.stringValue(), "Field", position++);
            length++;
        }

        // Persist the updated counters and fold in this field's boost.
        fieldLengths[fieldNumber] = length;
        fieldPositions[fieldNumber] = position;
        fieldBoosts[fieldNumber] *= field.getBoost();
    }
}
Aggregations