Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From the class TestCachingTokenFilter, method testCaching:
public void testCaching() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  AtomicInteger resetCount = new AtomicInteger(0);
  TokenStream stream = new TokenStream() {
    private int index = 0;
    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    @Override
    public void reset() throws IOException {
      super.reset();
      resetCount.incrementAndGet();
    }

    @Override
    public boolean incrementToken() {
      if (index == tokens.length) {
        return false;
      } else {
        clearAttributes();
        termAtt.append(tokens[index++]);
        offsetAtt.setOffset(0, 0);
        return true;
      }
    }
  };
  stream = new CachingTokenFilter(stream);
  doc.add(new TextField("preanalyzed", stream));
  // 1) we consume all tokens twice before we add the doc to the index
  assertFalse(((CachingTokenFilter) stream).isCached());
  stream.reset();
  assertFalse(((CachingTokenFilter) stream).isCached());
  checkTokens(stream);
  stream.reset();
  checkTokens(stream);
  assertTrue(((CachingTokenFilter) stream).isCached());
  // 2) now add the document to the index and verify that all tokens are indexed;
  //    don't reset the stream here, the DocumentsWriter should do that implicitly
  writer.addDocument(doc);
  IndexReader reader = writer.getReader();
  PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term1"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, termPositions.freq());
  assertEquals(0, termPositions.nextPosition());
  termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term2"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, termPositions.freq());
  assertEquals(1, termPositions.nextPosition());
  assertEquals(3, termPositions.nextPosition());
  termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term3"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(1, termPositions.freq());
  assertEquals(2, termPositions.nextPosition());
  reader.close();
  writer.close();
  // 3) reset the stream and consume the tokens again
  stream.reset();
  checkTokens(stream);
  assertEquals(1, resetCount.get());
  dir.close();
}
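The excerpt references two members of the enclosing test class that are not shown: the tokens array and the checkTokens helper. A minimal sketch of what they plausibly look like, with the token sequence inferred from the assertions above (term1 at position 0, term2 at positions 1 and 3, term3 at position 2); the exact bodies in the Lucene sources may differ:

  // Assumed test-class members, inferred from the assertions rather than copied.
  private String[] tokens = new String[] { "term1", "term2", "term3", "term2" };

  // Consumes the stream and asserts that it replays exactly the expected terms.
  private void checkTokens(TokenStream stream) throws IOException {
    int count = 0;
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
      assertTrue(count < tokens.length);
      assertEquals(tokens[count], termAtt.toString());
      count++;
    }
    assertEquals(tokens.length, count);
  }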
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From the class TestPositionIncrement, method testSetPosition:
public void testSetPosition() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new Tokenizer() {
        // TODO: use CannedTokenStream
        private final String[] TOKENS = { "1", "2", "3", "4", "5" };
        private final int[] INCREMENTS = { 1, 2, 1, 0, 1 };
        private int i = 0;
        PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        @Override
        public boolean incrementToken() {
          if (i == TOKENS.length)
            return false;
          clearAttributes();
          termAtt.append(TOKENS[i]);
          offsetAtt.setOffset(i, i);
          posIncrAtt.setPositionIncrement(INCREMENTS[i]);
          i++;
          return true;
        }

        @Override
        public void reset() throws IOException {
          super.reset();
          this.i = 0;
        }
      });
    }
  };
  Directory store = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), store, analyzer);
  Document d = new Document();
  d.add(newTextField("field", "bogus", Field.Store.YES));
  writer.addDocument(d);
  IndexReader reader = writer.getReader();
  writer.close();
  IndexSearcher searcher = newSearcher(reader);
  PostingsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("1"));
  pos.nextDoc();
  // first token should be at position 0
  assertEquals(0, pos.nextPosition());
  pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("2"));
  pos.nextDoc();
  // second token should be at position 2
  assertEquals(2, pos.nextPosition());
  PhraseQuery q;
  ScoreDoc[] hits;
  q = new PhraseQuery("field", "1", "2");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // same as previous, using the builder with implicit positions
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"));
  builder.add(new Term("field", "2"));
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // same as previous, just specifying the positions explicitly
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"), 0);
  builder.add(new Term("field", "2"), 1);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // specifying the correct positions should find the phrase
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "1"), 0);
  builder.add(new Term("field", "2"), 2);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "2", "3");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "3", "4");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // the phrase query does find it when the correct positions are specified
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "3"), 0);
  builder.add(new Term("field", "4"), 0);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  // a phrase query should fail for a term that does not exist,
  // even if another queried term does exist at the same position
  builder = new PhraseQuery.Builder();
  builder.add(new Term("field", "3"), 0);
  builder.add(new Term("field", "9"), 0);
  q = builder.build();
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  // a multi-phrase query should succeed for a term that does not exist,
  // because another queried term does exist at the same position
  MultiPhraseQuery.Builder mqb = new MultiPhraseQuery.Builder();
  mqb.add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
  hits = searcher.search(mqb.build(), 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "2", "4");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "3", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "4", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(1, hits.length);
  q = new PhraseQuery("field", "2", "5");
  hits = searcher.search(q, 1000).scoreDocs;
  assertEquals(0, hits.length);
  reader.close();
  store.close();
}
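The increments array { 1, 2, 1, 0, 1 } drives every assertion above: Lucene computes each token's absolute position as the previous position plus its increment, starting from -1, so the five tokens land at positions 0, 2, 3, 3, 4. A tiny standalone sketch of that arithmetic (not part of the test):

public class PositionArithmeticSketch {
  public static void main(String[] args) {
    String[] tokens = { "1", "2", "3", "4", "5" };
    int[] increments = { 1, 2, 1, 0, 1 };
    int position = -1; // positions start before the first token
    for (int i = 0; i < tokens.length; i++) {
      position += increments[i];
      System.out.println("token \"" + tokens[i] + "\" -> position " + position);
    }
    // Prints positions 0, 2, 3, 3, 4: "3" and "4" share position 3, which is
    // why the phrase "3 4" only matches with both terms at the same builder
    // position, while "2 4" matches as an adjacent pair (positions 2 and 3).
  }
}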
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From the class TestKeywordAnalyzer, method testMutipleDocument:
/*
public void testPerFieldAnalyzer() throws Exception {
  PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
  analyzer.addAnalyzer("partnum", new KeywordAnalyzer());
  QueryParser queryParser = new QueryParser("description", analyzer);
  Query query = queryParser.parse("partnum:Q36 AND SPACE");
  ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
  assertEquals("Q36 kept as-is",
      "+partnum:Q36 +space", query.toString("description"));
  assertEquals("doc found!", 1, hits.length);
}
*/
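The commented-out test appears to predate the current PerFieldAnalyzerWrapper API: the mutable addAnalyzer method is gone, and the per-field mapping is now passed to the constructor. A hedged sketch of the equivalent setup under the current API (field and analyzer names taken from the comment above):

  // Sketch only: per-field mapping is supplied at construction time.
  Map<String, Analyzer> perField = new HashMap<>();
  perField.put("partnum", new KeywordAnalyzer());
  // SimpleAnalyzer handles every other field, e.g. "description".
  Analyzer analyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), perField);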
public void testMutipleDocument() throws Exception {
  RAMDirectory dir = new RAMDirectory();
  Analyzer analyzer = new KeywordAnalyzer();
  IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer));
  Document doc = new Document();
  doc.add(new TextField("partnum", "Q36", Field.Store.YES));
  writer.addDocument(doc);
  doc = new Document();
  doc.add(new TextField("partnum", "Q37", Field.Store.YES));
  writer.addDocument(doc);
  writer.close();
  IndexReader reader = DirectoryReader.open(dir);
  PostingsEnum td = TestUtil.docs(random(), reader, "partnum", new BytesRef("Q36"), null, 0);
  assertTrue(td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  td = TestUtil.docs(random(), reader, "partnum", new BytesRef("Q37"), null, 0);
  assertTrue(td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  analyzer.close();
}
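KeywordAnalyzer does no tokenization at all: it emits the entire field value as a single token, which is why "Q36" and "Q37" are each found as exact terms above. A small standalone sketch that makes this visible (the field name and sample text are illustrative, not from the test; throws IOException):

  try (Analyzer kw = new KeywordAnalyzer();
       TokenStream ts = kw.tokenStream("partnum", "Q36 SPACE")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // prints exactly one token: "Q36 SPACE"
    }
    ts.end();
  }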
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From the class TestTeeSinkTokenFilter, method testEndOffsetPositionWithTeeSinkTokenFilter:
// LUCENE-1448
// TODO: instead of testing it this way, we can test
// with BaseTokenStreamTestCase now...
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  TokenStream tokenStream = analyzer.tokenStream("field", "abcd   "); // note: three trailing spaces
  TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
  TokenStream sink = tee.newSinkTokenStream();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
  Field f1 = new Field("field", tee, ft);
  Field f2 = new Field("field", sink, ft);
  doc.add(f1);
  doc.add(f2);
  w.addDocument(doc);
  w.close();
  IndexReader r = DirectoryReader.open(dir);
  Terms vector = r.getTermVectors(0).terms("field");
  assertEquals(1, vector.size());
  TermsEnum termsEnum = vector.iterator();
  termsEnum.next();
  assertEquals(2, termsEnum.totalTermFreq());
  PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
  assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, positions.freq());
  positions.nextPosition();
  assertEquals(0, positions.startOffset());
  assertEquals(4, positions.endOffset());
  positions.nextPosition();
  assertEquals(8, positions.startOffset());
  assertEquals(12, positions.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
  r.close();
  dir.close();
  analyzer.close();
}
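The second pair of offset assertions (8 and 12) follows from how Lucene stacks repeated instances of the same field: the second instance's offsets are shifted by the final offset reported by the first instance's end() (the length of the analyzed text) plus the analyzer's offset gap, which defaults to 1. A quick sketch of that arithmetic, assuming the three trailing spaces in "abcd   ":

  int textLength = "abcd   ".length(); // 7; end() reports this as the final offset
  int offsetGap = 1;                   // Analyzer.getOffsetGap() default
  int secondStart = 0 + textLength + offsetGap; // 8, matching the assertion
  int secondEnd = 4 + textLength + offsetGap;   // 12, matching the assertion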
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
From the class TestPerfTasksLogic, method testReadTokens:
/**
 * Test ReadTokensTask
 */
public void testReadTokens() throws Exception {
  // We will call ReadTokens on this many docs
  final int NUM_DOCS = 20;
  // Read tokens from first NUM_DOCS docs from Reuters and
  // then build index from the same docs
  String[] algLines1 = {
      "# ----- properties ",
      "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
      "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
      "docs.file=" + getReuters20LinesFile(),
      "# ----- alg ",
      "{ReadTokens}: " + NUM_DOCS,
      "ResetSystemErase",
      "CreateIndex",
      "{AddDoc}: " + NUM_DOCS,
      "CloseIndex"
  };
  // Run algo
  Benchmark benchmark = execBenchmark(algLines1);
  List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
  // Count how many tokens all ReadTokens saw
  int totalTokenCount1 = 0;
  for (final TaskStats stat : stats) {
    if (stat.getTask().getName().equals("ReadTokens")) {
      totalTokenCount1 += stat.getCount();
    }
  }
  // Separately count how many tokens are actually in the index:
  IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
  assertEquals(NUM_DOCS, reader.numDocs());
  int totalTokenCount2 = 0;
  Fields fields = MultiFields.getFields(reader);
  for (String fieldName : fields) {
    if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
      continue;
    }
    Terms terms = fields.terms(fieldName);
    if (terms == null) {
      continue;
    }
    TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = TestUtil.docs(random(), termsEnum, docs, PostingsEnum.FREQS);
      while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        totalTokenCount2 += docs.freq();
      }
    }
  }
  reader.close();
  // Make sure they are the same
  assertEquals(totalTokenCount1, totalTokenCount2);
}
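Because the counted fields are indexed with frequencies, the inner postings loop could in principle be replaced by the per-field statistic Terms.getSumTotalTermFreq(), the total number of token occurrences recorded for a field. A hedged sketch of that shortcut, reusing the same reader and field-skipping logic (only equivalent while no documents are deleted):

  long totalTokenCount3 = 0;
  for (String fieldName : fields) {
    if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD)
        || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
      continue;
    }
    Terms terms = fields.terms(fieldName);
    if (terms != null) {
      totalTokenCount3 += terms.getSumTotalTermFreq(); // total tokens indexed for this field
    }
  }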