Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From class TestMultiPhraseEnum, method testOneDocument:
/** Tests union on one document */
public void testOneDocument() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig();
  iwc.setMergePolicy(newLogMergePolicy());
  IndexWriter writer = new IndexWriter(dir, iwc);
  Document doc = new Document();
  doc.add(new TextField("field", "foo bar", Field.Store.NO));
  writer.addDocument(doc);
  DirectoryReader ir = DirectoryReader.open(writer);
  writer.close();
  PostingsEnum p1 = getOnlyLeafReader(ir).postings(new Term("field", "foo"), PostingsEnum.POSITIONS);
  PostingsEnum p2 = getOnlyLeafReader(ir).postings(new Term("field", "bar"), PostingsEnum.POSITIONS);
  PostingsEnum union = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(p1, p2));
  assertEquals(-1, union.docID());
  assertEquals(0, union.nextDoc());
  assertEquals(2, union.freq());
  assertEquals(0, union.nextPosition());
  assertEquals(1, union.nextPosition());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, union.nextDoc());
  ir.close();
  dir.close();
}
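For reference, the iteration contract this test exercises is the standard PostingsEnum pattern: advance with nextDoc(), read freq(), then call nextPosition() exactly freq() times before advancing again. A minimal sketch, reusing the reader above; the term "field"/"foo" is just the test's own name and any indexed term works:

// Sketch: the canonical way to consume a positions-enabled PostingsEnum.
PostingsEnum pe = getOnlyLeafReader(ir).postings(new Term("field", "foo"), PostingsEnum.POSITIONS);
if (pe != null) { // postings() returns null when the term does not exist
  while (pe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    int freq = pe.freq(); // occurrences in the current document
    for (int i = 0; i < freq; i++) {
      int position = pe.nextPosition(); // must be called exactly freq() times
    }
  }
}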
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From class TestMultiPhraseEnum, method testSomeDocuments:
/** Tests union on a few documents */
public void testSomeDocuments() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = newIndexWriterConfig();
  iwc.setMergePolicy(newLogMergePolicy());
  IndexWriter writer = new IndexWriter(dir, iwc);
  Document doc = new Document();
  doc.add(new TextField("field", "foo", Field.Store.NO));
  writer.addDocument(doc);
  writer.addDocument(new Document());
  doc = new Document();
  doc.add(new TextField("field", "foo bar", Field.Store.NO));
  writer.addDocument(doc);
  doc = new Document();
  doc.add(new TextField("field", "bar", Field.Store.NO));
  writer.addDocument(doc);
  writer.forceMerge(1);
  DirectoryReader ir = DirectoryReader.open(writer);
  writer.close();
  PostingsEnum p1 = getOnlyLeafReader(ir).postings(new Term("field", "foo"), PostingsEnum.POSITIONS);
  PostingsEnum p2 = getOnlyLeafReader(ir).postings(new Term("field", "bar"), PostingsEnum.POSITIONS);
  PostingsEnum union = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(p1, p2));
  assertEquals(-1, union.docID());
  assertEquals(0, union.nextDoc());
  assertEquals(1, union.freq());
  assertEquals(0, union.nextPosition());
  assertEquals(2, union.nextDoc());
  assertEquals(2, union.freq());
  assertEquals(0, union.nextPosition());
  assertEquals(1, union.nextPosition());
  assertEquals(3, union.nextDoc());
  assertEquals(1, union.freq());
  assertEquals(0, union.nextPosition());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, union.nextDoc());
  ir.close();
  dir.close();
}
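The assertions spell out UnionPostingsEnum's contract: freq() is the total position count across the sub-enums for the current document, nextPosition() yields the merged positions in ascending order, and documents matched by neither term (doc 1, the empty document) are skipped. A hedged sketch that dumps the merged postings, reusing p1 and p2 from the test; note that, like the test itself, this assumes access to the package-private UnionPostingsEnum:

// Sketch: dump every matching doc and its merged positions.
PostingsEnum union = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(p1, p2));
int doc;
while ((doc = union.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
  StringBuilder sb = new StringBuilder("doc=" + doc + " positions=");
  int freq = union.freq(); // total across "foo" and "bar"
  for (int i = 0; i < freq; i++) {
    sb.append(union.nextPosition()).append(' '); // merged, ascending
  }
  System.out.println(sb); // prints docs 0, 2, 3 with their merged positions
}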
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From class TestCachingTokenFilter, method testCaching:
public void testCaching() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  AtomicInteger resetCount = new AtomicInteger(0);
  TokenStream stream = new TokenStream() {

    private int index = 0;
    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    @Override
    public void reset() throws IOException {
      super.reset();
      resetCount.incrementAndGet();
    }

    @Override
    public boolean incrementToken() {
      if (index == tokens.length) {
        return false;
      } else {
        clearAttributes();
        termAtt.append(tokens[index++]);
        offsetAtt.setOffset(0, 0);
        return true;
      }
    }
  };
  stream = new CachingTokenFilter(stream);
  doc.add(new TextField("preanalyzed", stream));
  // 1) we consume all tokens twice before we add the doc to the index
  assertFalse(((CachingTokenFilter) stream).isCached());
  stream.reset();
  assertFalse(((CachingTokenFilter) stream).isCached());
  checkTokens(stream);
  stream.reset();
  checkTokens(stream);
  assertTrue(((CachingTokenFilter) stream).isCached());
  // 2) now add the document to the index and verify that all tokens are indexed;
  // don't reset the stream here, the DocumentWriter should do that implicitly
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term1"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, termPositions.freq());
  assertEquals(0, termPositions.nextPosition());
  termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term2"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, termPositions.freq());
  assertEquals(1, termPositions.nextPosition());
  assertEquals(3, termPositions.nextPosition());
  termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term3"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, termPositions.freq());
  assertEquals(2, termPositions.nextPosition());
  reader.close();
  writer.close();
  // 3) reset stream and consume tokens again
  stream.reset();
  checkTokens(stream);
  assertEquals(1, resetCount.get());
  dir.close();
}
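Outside a test, this caching behavior is what makes CachingTokenFilter useful whenever a stream must be consumed more than once, since the wrapped stream's reset() runs only on the first pass and later passes replay the cache. A minimal stand-alone sketch, assuming a StandardAnalyzer and illustrative field/text values:

// Sketch: consume the same analyzed text twice via CachingTokenFilter.
Analyzer analyzer = new StandardAnalyzer();
TokenStream ts = new CachingTokenFilter(analyzer.tokenStream("body", "term1 term2 term2 term3"));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset(); // first pass fills the cache from the underlying stream
while (ts.incrementToken()) {
  System.out.println(termAtt);
}
ts.reset(); // second pass replays the cache; the input is not reset again
while (ts.incrementToken()) {
  System.out.println(termAtt);
}
ts.end();
ts.close();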
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From class TestPositionIncrement, method testSetPosition:
public void testSetPosition() throws Exception {
  Analyzer analyzer = new Analyzer() {

    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new Tokenizer() {

        // TODO: use CannedTokenStream
        private final String[] TOKENS = { "1", "2", "3", "4", "5" };
        private final int[] INCREMENTS = { 1, 2, 1, 0, 1 };
        private int i = 0;

        PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        @Override
        public boolean incrementToken() {
          if (i == TOKENS.length)
            return false;
          clearAttributes();
          termAtt.append(TOKENS[i]);
          offsetAtt.setOffset(i, i);
          posIncrAtt.setPositionIncrement(INCREMENTS[i]);
          i++;
          return true;
        }

        @Override
        public void reset() throws IOException {
          super.reset();
          this.i = 0;
        }
      });
    }
  };
  Directory store = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), store, analyzer);
  Document d = new Document();
  d.add(newTextField("field", "bogus", Field.Store.YES));
  writer.addDocument(d);
  IndexReader reader = writer.getReader();
  writer.close();
  IndexSearcher searcher = newSearcher(reader);
  PostingsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("1"));
  pos.nextDoc();
  // first token should be at position 0
  assertEquals(0, pos.nextPosition());
  pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("2"));
  pos.nextDoc();
  // second token should be at position 2 (its position increment is 2)
  assertEquals(2, pos.nextPosition());
  PhraseQuery q;
  ScoreDoc[] hits;
  q = new PhraseQuery("field", "1", "2");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // same as previous, using the builder with implicit positions
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"));
  builder.add(new Term("field", "2"));
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // same as previous, just specifying positions explicitly
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"), 0);
  builder.add(new Term("field", "2"), 1);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // specifying the correct positions should find the phrase
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"), 0);
  builder.add(new Term("field", "2"), 2);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "2", "3");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "3", "4");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // the phrase query finds it when the correct positions are specified
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "3"), 0);
  builder.add(new Term("field", "4"), 0);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  // a phrase query should fail for a non-existent term,
  // even if another searched term exists at the same position
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "3"), 0);
  builder.add(new Term("field", "9"), 0);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // a multi-phrase query should succeed for a non-existent term,
  // because another searched term exists at the same position
  MultiPhraseQuery.Builder mqb = new MultiPhraseQuery.Builder();
  mqb.add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
  hits = searcher.search(mqb.build(), 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "2", "4");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "3", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "4", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "2", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  reader.close();
  store.close();
}
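An alternative to spelling out explicit positions, not covered by this test, is slop: a sloppy phrase query tolerates position gaps up to the given edit distance. A hedged sketch against the same index, where "1" sits at position 0 and "2" at position 2:

// Sketch: slop of 1 covers the single-position gap that made the exact query fail.
PhraseQuery sloppy = new PhraseQuery(1, "field", "1", "2");
ScoreDoc[] sloppyHits = searcher.search(sloppy, 1000).scoreDocs;
// expected: 1 hit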
Use of org.apache.lucene.index.PostingsEnum in project textdb by TextDB.
From class DataReader, method buildPayloadFromTermVector:
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
  ArrayList<Span> payloadSpanList = new ArrayList<>();
  for (Attribute attr : inputSchema.getAttributes()) {
    String attributeName = attr.getName();
    AttributeType attributeType = attr.getType();
    // only TEXT fields contribute to the payload
    if (attributeType != AttributeType.TEXT) {
      continue;
    }
    String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();
    Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
    if (termVector == null) {
      continue;
    }
    TermsEnum termsEnum = termVector.iterator();
    PostingsEnum termPostings = null;
    // go through document terms
    while ((termsEnum.next()) != null) {
      termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
      if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
        continue;
      }
      // for each term, go through its postings
      for (int i = 0; i < termPostings.freq(); i++) {
        // nextPosition needs to be called first
        int tokenPosition = termPostings.nextPosition();
        int charStart = termPostings.startOffset();
        int charEnd = termPostings.endOffset();
        String analyzedTermStr = termsEnum.term().utf8ToString();
        String originalTermStr = fieldValue.substring(charStart, charEnd);
        Span span = new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition);
        payloadSpanList.add(span);
      }
    }
  }
  return payloadSpanList;
}
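Note that this walk depends on term vectors with positions and offsets having been stored at indexing time: without them, getTermVector() returns null (handled above), and without offsets, startOffset()/endOffset() return -1, which would break the substring call. A minimal sketch of the field configuration the writer side would need; the field/value names echo the method's variables and are illustrative:

// Sketch: field type required for the term-vector walk above.
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
ft.freeze(); // make the type immutable before use
doc.add(new Field(attributeName, fieldValue, ft));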