use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.
the class TestEmptyTokenStream method testIndexWriter_LUCENE4656.
/**
 * Indexes a document whose "description" field is backed by an {@link EmptyTokenStream} and
 * verifies that the document is added even though the stream exposes no
 * {@link TermToBytesRefAttribute}. (The method name refers to issue LUCENE-4656.)
 *
 * @throws IOException if the directory or the writer fails
 */
public void testIndexWriter_LUCENE4656() throws IOException {
  // try-with-resources closes the writer first and the directory second (reverse declaration
  // order), and does so even when an assertion or addDocument fails part-way through.
  try (Directory directory = newDirectory();
      IndexWriter writer = new IndexWriter(directory, newIndexWriterConfig(null))) {
    TokenStream ts = new EmptyTokenStream();
    assertFalse(ts.hasAttribute(TermToBytesRefAttribute.class));

    Document doc = new Document();
    doc.add(new StringField("id", "0", Field.Store.YES));
    doc.add(new TextField("description", ts));

    // this should not fail because we have no TermToBytesRefAttribute
    writer.addDocument(doc);

    assertEquals(1, writer.numDocs());
  }
}
use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.
the class GraphTokenStreamFiniteStrings method build.
/**
 * Consumes the given {@link TokenStream} and assembles an {@link Automaton} with one
 * transition per token, running from the token's start position to its end position
 * (start position plus position length).
 *
 * @param in the token stream to read; it is reset, fully consumed and ended here
 * @return the finished automaton
 * @throws IOException if reading from the stream fails
 * @throws IllegalStateException if the first token has a position increment below 1
 */
private Automaton build(final TokenStream in) throws IOException {
  final Automaton.Builder automatonBuilder = new Automaton.Builder();
  final TermToBytesRefAttribute bytesAttribute = in.addAttribute(TermToBytesRefAttribute.class);
  final PositionIncrementAttribute incrementAttribute = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute lengthAttribute = in.addAttribute(PositionLengthAttribute.class);

  in.reset();

  int startPos = -1;
  int lastNonZeroIncrement = 1;
  int lastState = -1;

  while (in.incrementToken()) {
    final int tokenIncrement = incrementAttribute.getPositionIncrement();
    if (startPos == -1 && tokenIncrement < 1) {
      throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
    }

    // Positions advance by at most one step per token while building; the original
    // increment is still handed to getTermID below.
    if (tokenIncrement >= 1) {
      startPos++;
    }

    final int endPos = startPos + lengthAttribute.getPositionLength();

    // Allocate states until the end position of this token exists.
    while (lastState < endPos) {
      lastState = automatonBuilder.createState();
    }

    final int termId = getTermID(tokenIncrement, lastNonZeroIncrement, bytesAttribute.getBytesRef());
    automatonBuilder.addTransition(startPos, endPos, termId);

    // Stacked tokens (increment 0) must not overwrite the remembered increment.
    if (tokenIncrement > 0) {
      lastNonZeroIncrement = tokenIncrement;
    }
  }

  in.end();

  // An empty stream creates no states, so there is nothing to mark as accepting.
  if (lastState != -1) {
    automatonBuilder.setAccept(lastState, true);
  }
  return automatonBuilder.finish();
}
use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.
the class ReadTokensTask method doLogic.
/**
 * Runs every indexed, tokenized field of the current document through the run data's
 * analyzer and counts the tokens produced. The count is also added to
 * {@code totalTokenCount}.
 *
 * @return the number of tokens produced for this document
 * @throws Exception if obtaining or consuming a token stream fails
 */
@Override
public int doLogic() throws Exception {
  List<IndexableField> fields = doc.getFields();
  Analyzer analyzer = getRunData().getAnalyzer();
  int tokenCount = 0;
  for (final IndexableField field : fields) {
    if (field.fieldType().indexOptions() == IndexOptions.NONE || field.fieldType().tokenized() == false) {
      continue;
    }
    // try-with-resources guarantees close() even when reset(), incrementToken() or end()
    // throws, so a failure on one field does not leave the stream unclosed.
    try (TokenStream stream = field.tokenStream(analyzer, null)) {
      // reset the TokenStream to the first token
      stream.reset();
      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      while (stream.incrementToken()) {
        // The returned bytes are discarded; presumably the call is kept so the cost of
        // producing the term bytes is included in the task.
        termAtt.getBytesRef();
        tokenCount++;
      }
      stream.end();
    }
  }
  totalTokenCount += tokenCount;
  return tokenCount;
}
use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.
the class TestLongPostings method getRandomTerm.
// Produces a realistic unicode random string that
// survives MockAnalyzer unchanged, i.e. the analyzer emits exactly one
// token and that token's text equals the input. Candidates equal to
// `other` (when non-null) are rejected; generation repeats until a
// candidate qualifies.
private String getRandomTerm(String other) throws IOException {
  // The analyzer is Closeable; try-with-resources releases it on return or on failure.
  try (Analyzer a = new MockAnalyzer(random())) {
    while (true) {
      String s = TestUtil.randomRealisticUnicodeString(random());
      if (other != null && s.equals(other)) {
        continue;
      }
      try (TokenStream ts = a.tokenStream("foo", s)) {
        final TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
        ts.reset();
        int count = 0;
        boolean changed = false;
        while (ts.incrementToken()) {
          final BytesRef termBytes = termAtt.getBytesRef();
          if (count == 0 && !termBytes.utf8ToString().equals(s)) {
            // The value was changed during analysis. Keep iterating so the
            // tokenStream is exhausted.
            changed = true;
          }
          count++;
        }
        ts.end();
        // Did we iterate just once and the value was unchanged?
        if (!changed && count == 1) {
          return s;
        }
      }
    }
  }
}
use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.
the class QueryBuilder method analyzeMultiPhrase.
/**
 * Builds a multi-phrase query from the tokens of the given stream. Tokens are collected into
 * a group until a token with a positive position increment arrives; each group is then added
 * to the query as one set of alternative terms.
 *
 * @param field the field the created terms belong to
 * @param stream the token stream to read; it is reset and consumed here
 * @param slop the slop to set on the query
 * @return the built query
 * @throws IOException if reading from the stream fails
 */
protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException {
  final MultiPhraseQuery.Builder queryBuilder = newMultiPhraseQueryBuilder();
  queryBuilder.setSlop(slop);

  final TermToBytesRefAttribute bytesAttribute = stream.getAttribute(TermToBytesRefAttribute.class);
  final PositionIncrementAttribute incrementAttribute = stream.getAttribute(PositionIncrementAttribute.class);

  final List<Term> pendingTerms = new ArrayList<>();
  int currentPosition = -1;

  stream.reset();
  while (stream.incrementToken()) {
    final int increment = incrementAttribute.getPositionIncrement();

    // A positive increment closes the group collected so far.
    if (increment > 0 && !pendingTerms.isEmpty()) {
      final Term[] group = pendingTerms.toArray(new Term[0]);
      if (enablePositionIncrements) {
        queryBuilder.add(group, currentPosition);
      } else {
        queryBuilder.add(group);
      }
      pendingTerms.clear();
    }

    currentPosition += increment;
    pendingTerms.add(new Term(field, bytesAttribute.getBytesRef()));
  }

  // Add whatever is still pending after the last token.
  final Term[] lastGroup = pendingTerms.toArray(new Term[0]);
  if (enablePositionIncrements) {
    queryBuilder.add(lastGroup, currentPosition);
  } else {
    queryBuilder.add(lastGroup);
  }
  return queryBuilder.build();
}
Aggregations