Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
The class GraphTokenStreamFiniteStrings, method build().
/**
 * Build an automaton from the provided {@link TokenStream}.
 */
private Automaton build(final TokenStream in) throws IOException {
  Automaton.Builder builder = new Automaton.Builder();
  final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
  final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
  in.reset();
  int pos = -1;
  int prevIncr = 1;
  int state = -1;
  while (in.incrementToken()) {
    int currentIncr = posIncAtt.getPositionIncrement();
    if (pos == -1 && currentIncr < 1) {
      throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
    }
    // clamp the increment to at most 1 while building, but keep the original value
    int incr = Math.min(1, currentIncr);
    if (incr > 0) {
      pos += incr;
    }
    int endPos = pos + posLengthAtt.getPositionLength();
    while (state < endPos) {
      state = builder.createState();
    }
    BytesRef term = termBytesAtt.getBytesRef();
    int id = getTermID(currentIncr, prevIncr, term);
    builder.addTransition(pos, endPos, id);
    // only save the last increment when it is non-zero, in case we have multiple stacked tokens
    if (currentIncr > 0) {
      prevIncr = currentIncr;
    }
  }
  in.end();
  if (state != -1) {
    builder.setAccept(state, true);
  }
  return builder.finish();
}
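For context, build() consumes a token graph in which stacked tokens (synonyms) carry a position increment of 0 and multi-position tokens advertise their span through PositionLengthAttribute. The following minimal sketch of such an input uses the test framework's CannedTokenStream and Token; the "wi fi"/"wifi" synonym example is an illustration, not taken from this class.

Token wi = new Token("wi", 0, 2);         // position 0; increment defaults to 1
Token wifi = new Token("wifi", 0, 5);
wifi.setPositionIncrement(0);             // stacked on top of "wi"
wifi.setPositionLength(2);                // spans the two positions of "wi fi"
Token fi = new Token("fi", 3, 5);         // position 1
TokenStream graph = new CannedTokenStream(wi, wifi, fi);

Feeding this stream to build() yields three states: "wi" becomes a transition from state 0 to 1, "fi" from 1 to 2, and "wifi" a single transition from 0 to 2 bridging both positions.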
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
The class TestPositionIncrement, method testSetPosition().
public void testSetPosition() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new Tokenizer() {
        // TODO: use CannedTokenStream
        private final String[] TOKENS = { "1", "2", "3", "4", "5" };
        private final int[] INCREMENTS = { 1, 2, 1, 0, 1 };
        private int i = 0;
        PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        @Override
        public boolean incrementToken() {
          if (i == TOKENS.length)
            return false;
          clearAttributes();
          termAtt.append(TOKENS[i]);
          offsetAtt.setOffset(i, i);
          posIncrAtt.setPositionIncrement(INCREMENTS[i]);
          i++;
          return true;
        }

        @Override
        public void reset() throws IOException {
          super.reset();
          this.i = 0;
        }
      });
    }
  };
  Directory store = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), store, analyzer);
  Document d = new Document();
  d.add(newTextField("field", "bogus", Field.Store.YES));
  writer.addDocument(d);
  IndexReader reader = writer.getReader();
  writer.close();
  IndexSearcher searcher = newSearcher(reader);
  PostingsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("1"));
  pos.nextDoc();
  // first token should be at position 0
  assertEquals(0, pos.nextPosition());
  pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("2"));
  pos.nextDoc();
  // second token should be at position 2
  assertEquals(2, pos.nextPosition());
  PhraseQuery q;
  ScoreDoc[] hits;
  q = new PhraseQuery("field", "1", "2");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // same as previous, using the builder with implicit positions
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"));
  builder.add(new Term("field", "2"));
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // same as previous, just specify positions explicitly.
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"), 0);
  builder.add(new Term("field", "2"), 1);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // specifying the correct positions should find the phrase.
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"), 0);
  builder.add(new Term("field", "2"), 2);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "2", "3");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "3", "4");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // the phrase query does find "3 4" once the correct (identical) positions are specified.
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "3"), 0);
  builder.add(new Term("field", "4"), 0);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  // a phrase query should fail for a non-existing term,
  // even if another queried term exists at the same position.
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "3"), 0);
  builder.add(new Term("field", "9"), 0);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // a multi-phrase query should succeed for a non-existing term,
  // because another queried term exists at the same position.
  MultiPhraseQuery.Builder mqb = new MultiPhraseQuery.Builder();
  mqb.add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
  hits = searcher.search(mqb.build(), 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "2", "4");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "3", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "4", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "2", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  reader.close();
  store.close();
}
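The TODO above suggests replacing the hand-rolled Tokenizer with the test framework's CannedTokenStream. Below is a sketch of that equivalent, as an assumption about how the TODO might be resolved; note that an Analyzer still needs a Tokenizer as its source, so the canned stream would more naturally be indexed directly, e.g. via new TextField("field", stream).

String[] terms = { "1", "2", "3", "4", "5" };
int[] increments = { 1, 2, 1, 0, 1 };
Token[] tokens = new Token[terms.length];
for (int i = 0; i < tokens.length; i++) {
  tokens[i] = new Token(terms[i], i, i);            // same degenerate offsets as above
  tokens[i].setPositionIncrement(increments[i]);
}
// resulting positions: "1"=0, "2"=2, "3"=3, "4"=3, "5"=4
TokenStream canned = new CannedTokenStream(tokens);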
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
The class TestDocumentWriter, method testTokenReuse().
public void testTokenReuse() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new TokenFilter(tokenizer) {
        boolean first = true;
        AttributeSource.State state;

        @Override
        public boolean incrementToken() throws IOException {
          if (state != null) {
            restoreState(state);
            payloadAtt.setPayload(null);
            posIncrAtt.setPositionIncrement(0);
            termAtt.setEmpty().append("b");
            state = null;
            return true;
          }
          boolean hasNext = input.incrementToken();
          if (!hasNext)
            return false;
          if (Character.isDigit(termAtt.buffer()[0])) {
            posIncrAtt.setPositionIncrement(termAtt.buffer()[0] - '0');
          }
          if (first) {
            // set payload on first position only
            payloadAtt.setPayload(new BytesRef(new byte[] { 100 }));
            first = false;
          }
          // index a "synonym" for every token
          state = captureState();
          return true;
        }

        @Override
        public void reset() throws IOException {
          super.reset();
          first = true;
          state = null;
        }

        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
        final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      });
    }
  };
  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  doc.add(newTextField("f1", "a 5 a a", Field.Store.YES));
  writer.addDocument(doc);
  writer.commit();
  SegmentCommitInfo info = writer.newestSegment();
  writer.close();
  SegmentReader reader = new SegmentReader(info, Version.LATEST.major, newIOContext(random()));
  PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "f1", new BytesRef("a"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  int freq = termPositions.freq();
  assertEquals(3, freq);
  assertEquals(0, termPositions.nextPosition());
  assertNotNull(termPositions.getPayload());
  assertEquals(6, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  assertEquals(7, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  reader.close();
}
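To see why "a" lands at positions 0, 6, and 7: the filter above maps a leading digit to its numeric value as the position increment, so the token "5" advances the position by 5, and each injected "b" synonym reuses its predecessor's position via an increment of 0. The emitted sequence for "a 5 a a" works out as:

// term:      a   b   5   b   a   b   a   b
// increment: 1   0   5   0   1   0   1   0
// position:  0   0   5   5   6   6   7   7

The payload is set on the first token only, and the injected synonyms explicitly null it out, which is why getPayload() is non-null only at the first position of "a".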
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
The class TestTypeTokenFilter, method testPositons().
private void testPositons(TypeTokenFilter stpf) throws IOException {
  TypeAttribute typeAtt = stpf.getAttribute(TypeAttribute.class);
  CharTermAttribute termAttribute = stpf.getAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
  stpf.reset();
  while (stpf.incrementToken()) {
    log("Token: " + termAttribute.toString() + ": " + typeAtt.type() + " - " + posIncrAtt.getPositionIncrement());
    assertEquals("if position increments are enabled, the position increment should be 3; otherwise 1",
        3, posIncrAtt.getPositionIncrement());
  }
  stpf.end();
  stpf.close();
}
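The construction of the TypeTokenFilter under test is not shown here. One plausible way to drive this helper assumes StandardTokenizer (which tags numeric tokens as "<NUM>") over a sentence shaped so that every kept word is preceded by exactly two removed tokens; the input text and stop-type set are assumptions, not the project's actual caller.

Set<String> stopTypes = Collections.singleton("<NUM>");
StandardTokenizer tokenizer = new StandardTokenizer();
tokenizer.setReader(new StringReader("1 2 one 3 4 two 5 6 three"));
// each surviving word absorbs the two removed "<NUM>" tokens before it,
// so every position increment observed by the helper is 3
testPositons(new TypeTokenFilter(tokenizer, stopTypes));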
Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.
The class TestTeeSinkTokenFilter, method performance().
/**
 * Not an explicit test, just useful for printing out some info on performance
 */
@SuppressWarnings("resource")
public void performance() throws Exception {
  int[] tokCount = { 100, 500, 1000, 2000, 5000, 10000 };
  int[] modCounts = { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
  for (int k = 0; k < tokCount.length; k++) {
    StringBuilder buffer = new StringBuilder();
    System.out.println("-----Tokens: " + tokCount[k] + "-----");
    for (int i = 0; i < tokCount[k]; i++) {
      buffer.append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).append(' ');
    }
    // make sure we produce the same tokens
    TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer)));
    TokenStream sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), 100);
    teeStream.consumeAllTokens();
    TokenStream stream = new ModuloTokenFilter(new StandardFilter(standardTokenizer(buffer)), 100);
    CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
    CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class);
    for (int i = 0; stream.incrementToken(); i++) {
      assertTrue(sink.incrementToken());
      assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok));
    }
    // simulate two fields, each being analyzed once, for 20 documents
    for (int j = 0; j < modCounts.length; j++) {
      int tfPos = 0;
      long start = System.currentTimeMillis();
      for (int i = 0; i < 20; i++) {
        stream = new StandardFilter(standardTokenizer(buffer));
        PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
        while (stream.incrementToken()) {
          tfPos += posIncrAtt.getPositionIncrement();
        }
        stream = new ModuloTokenFilter(new StandardFilter(standardTokenizer(buffer)), modCounts[j]);
        posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
        while (stream.incrementToken()) {
          tfPos += posIncrAtt.getPositionIncrement();
        }
      }
      long finish = System.currentTimeMillis();
      System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
      int sinkPos = 0;
      // simulate one field with one sink
      start = System.currentTimeMillis();
      for (int i = 0; i < 20; i++) {
        teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer)));
        sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), modCounts[j]);
        PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class);
        while (teeStream.incrementToken()) {
          sinkPos += posIncrAtt.getPositionIncrement();
        }
        posIncrAtt = sink.getAttribute(PositionIncrementAttribute.class);
        while (sink.incrementToken()) {
          sinkPos += posIncrAtt.getPositionIncrement();
        }
      }
      finish = System.currentTimeMillis();
      System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
      assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
    }
    System.out.println("- End Tokens: " + tokCount[k] + "-----");
  }
}
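ModuloTokenFilter and standardTokenizer(buffer) are helpers defined elsewhere in TestTeeSinkTokenFilter. A sketch of the modulo filter's idea, reconstructed here as an assumption rather than the verbatim helper: it passes through only every modCount-th token, so larger mod counts mean fewer tokens survive.

static final class ModuloTokenFilter extends TokenFilter {
  private final int modCount;
  private int count = 0;

  ModuloTokenFilter(TokenStream input, int modCount) {
    super(input);
    this.modCount = modCount;
  }

  @Override
  public boolean incrementToken() throws IOException {
    // skip tokens until the running count is a multiple of modCount
    boolean hasNext;
    for (hasNext = input.incrementToken();
         hasNext && count % modCount != 0;
         hasNext = input.incrementToken()) {
      count++;
    }
    count++;
    return hasNext;
  }
}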