Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class TestRTGBase, the method getFirstMatch seeks a single term and returns the ID of the only live document containing it, or -1 if there is none.
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Terms terms = MultiFields.getTerms(r, t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator();
  if (!termsEnum.seekExact(termBytes)) {
    return -1;
  }
  PostingsEnum docs = termsEnum.postings(null, PostingsEnum.NONE);
  docs = BitsFilteredPostingsEnum.wrap(docs, MultiFields.getLiveDocs(r));
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
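For orientation, here is the same seek-then-iterate pattern as a standalone program. This is a minimal sketch, not code from the project: the index path, field name, and term are hypothetical placeholders, and it assumes the same API generation as the test above (MultiFields and PostingsEnum). For brevity it also skips the deleted-documents filtering that the test's BitsFilteredPostingsEnum.wrap call performs.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class SeekExactSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical index location, field, and term.
    try (IndexReader r = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
      Terms terms = MultiFields.getTerms(r, "body");
      if (terms == null) return; // field missing or not indexed with terms
      TermsEnum termsEnum = terms.iterator();
      if (termsEnum.seekExact(new BytesRef("lucene"))) {
        // PostingsEnum.NONE: only doc IDs are needed, no freqs/positions.
        PostingsEnum docs = termsEnum.postings(null, PostingsEnum.NONE);
        int doc;
        while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          System.out.println("term occurs in doc " + doc);
        }
      }
    }
  }
}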
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class TestPerfTasksLogic, the method testReadTokens checks that the ReadTokens benchmark task saw exactly as many tokens as were indexed from the same documents.
/**
 * Test ReadTokensTask
 */
public void testReadTokens() throws Exception {
  // We will call ReadTokens on this many docs
  final int NUM_DOCS = 20;
  // Read tokens from first NUM_DOCS docs from Reuters and
  // then build index from the same docs
  String[] algLines1 = {
    "# ----- properties ",
    "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
    "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
    "docs.file=" + getReuters20LinesFile(),
    "# ----- alg ",
    "{ReadTokens}: " + NUM_DOCS,
    "ResetSystemErase",
    "CreateIndex",
    "{AddDoc}: " + NUM_DOCS,
    "CloseIndex"
  };
  // Run algo
  Benchmark benchmark = execBenchmark(algLines1);
  List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
  // Count how many tokens all ReadTokens saw
  int totalTokenCount1 = 0;
  for (final TaskStats stat : stats) {
    if (stat.getTask().getName().equals("ReadTokens")) {
      totalTokenCount1 += stat.getCount();
    }
  }
  // Separately count how many tokens are actually in the index:
  IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
  assertEquals(NUM_DOCS, reader.numDocs());
  int totalTokenCount2 = 0;
  Fields fields = MultiFields.getFields(reader);
  for (String fieldName : fields) {
    if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
      continue;
    }
    Terms terms = fields.terms(fieldName);
    if (terms == null) {
      continue;
    }
    TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = TestUtil.docs(random(), termsEnum, docs, PostingsEnum.FREQS);
      while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        totalTokenCount2 += docs.freq();
      }
    }
  }
  reader.close();
  // Make sure they are the same
  assertEquals(totalTokenCount1, totalTokenCount2);
}
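The counting loop at the end of the test is a general recipe for totaling token occurrences in a field: enumerate every term with TermsEnum.next(), open postings with FREQS, and sum freq() per document. A minimal sketch of just that recipe, assuming the same MultiFields-era API and calling termsEnum.postings directly rather than through the test's TestUtil.docs wrapper:

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;

public class TokenCountSketch {
  /** Sums freq() over every term and document of one field. */
  static long countTokens(IndexReader reader, String fieldName) throws IOException {
    Terms terms = MultiFields.getTerms(reader, fieldName);
    if (terms == null) return 0;
    TermsEnum termsEnum = terms.iterator();
    PostingsEnum postings = null; // passed back in as 'reuse' each iteration
    long total = 0;
    while (termsEnum.next() != null) {
      postings = termsEnum.postings(postings, PostingsEnum.FREQS);
      while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        total += postings.freq(); // occurrences of this term in this doc
      }
    }
    return total;
  }
}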
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class TestDocTermOrds, the method verify cross-checks the per-document term ordinals produced by DocTermOrds against the expected terms for each document.
private void verify(LeafReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef) throws Exception {
  final DocTermOrds dto = new DocTermOrds(r, r.getLiveDocs(), "field", prefixRef, Integer.MAX_VALUE, TestUtil.nextInt(random(), 2, 10));
  final NumericDocValues docIDToID = FieldCache.DEFAULT.getNumerics(r, "id", FieldCache.LEGACY_INT_PARSER);
  if (VERBOSE) {
    System.out.println("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.utf8ToString()));
    System.out.println("TEST: all TERMS:");
    TermsEnum allTE = MultiFields.getTerms(r, "field").iterator();
    int ord = 0;
    while (allTE.next() != null) {
      System.out.println("  ord=" + (ord++) + " term=" + allTE.term().utf8ToString());
    }
  }
  //final TermsEnum te = subR.fields().terms("field").iterator();
  final TermsEnum te = dto.getOrdTermsEnum(r);
  if (dto.numTerms() == 0) {
    if (prefixRef == null) {
      assertNull(MultiFields.getTerms(r, "field"));
    } else {
      Terms terms = MultiFields.getTerms(r, "field");
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator();
        TermsEnum.SeekStatus result = termsEnum.seekCeil(prefixRef);
        if (result != TermsEnum.SeekStatus.END) {
          assertFalse("term=" + termsEnum.term().utf8ToString() + " matches prefix=" + prefixRef.utf8ToString(),
                      StringHelper.startsWith(termsEnum.term(), prefixRef));
        } else {
          // ok
        }
      } else {
        // ok
      }
    }
    return;
  }
  if (VERBOSE) {
    System.out.println("TEST: TERMS:");
    te.seekExact(0);
    while (true) {
      System.out.println("  ord=" + te.ord() + " term=" + te.term().utf8ToString());
      if (te.next() == null) {
        break;
      }
    }
  }
  SortedSetDocValues iter = dto.iterator(r);
  for (int docID = 0; docID < r.maxDoc(); docID++) {
    assertEquals(docID, docIDToID.nextDoc());
    if (docID > iter.docID()) {
      iter.nextDoc();
    }
    if (docID < iter.docID()) {
      int[] answers = idToOrds[(int) docIDToID.longValue()];
      assertEquals(0, answers.length);
      continue;
    }
    if (VERBOSE) {
      System.out.println("TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.longValue() + ")");
    }
    final int[] answers = idToOrds[(int) docIDToID.longValue()];
    int upto = 0;
    long ord;
    while ((ord = iter.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
      te.seekExact(ord);
      final BytesRef expected = termsArray[answers[upto++]];
      if (VERBOSE) {
        System.out.println("  exp=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString());
      }
      assertEquals("expected=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString() + " ord=" + ord, expected, te.term());
    }
    assertEquals(answers.length, upto);
  }
}
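The heart of verify is ordinal addressing: SortedSetDocValues.nextOrd() yields term ordinals per document, and an ordinal resolves back to its term either via TermsEnum.seekExact(long) as above or via the lookupOrd convenience used below. A minimal sketch of that round trip, assuming the iterator-style doc-values API the test uses; the reader and field name are hypothetical:

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class OrdinalSketch {
  /** Prints each document's distinct terms for a SORTED_SET doc-values field. */
  static void dumpOrds(LeafReader leaf, String field) throws IOException {
    SortedSetDocValues values = leaf.getSortedSetDocValues(field);
    if (values == null) return; // field has no doc values
    // nextDoc() visits only documents that actually have values.
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
      long ord;
      while ((ord = values.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        BytesRef term = values.lookupOrd(ord); // ordinal -> term bytes
        System.out.println("doc=" + doc + " ord=" + ord + " term=" + term.utf8ToString());
      }
    }
  }
}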
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class TestFaceting, the method doTermEnum exercises a doc-values TermsEnum by seeking both by term string and by ordinal.
void doTermEnum(int size) throws Exception {
  //System.out.println("doTermEnum size=" + size);
  close();
  createIndex(size);
  req = lrf.makeRequest("q", "*:*");
  SortedSetDocValues dv = DocValues.getSortedSet(req.getSearcher().getSlowAtomicReader(), proto.field());
  assertEquals(size, dv.getValueCount());
  TermsEnum te = dv.termsEnum();
  Random r = new Random(size);
  // test seeking by term string
  for (int i = 0; i < size * 2 + 10; i++) {
    int rnum = r.nextInt(size + 2);
    String s = t(rnum);
    //System.out.println("s=" + s);
    final BytesRef br;
    if (te == null) {
      br = null;
    } else {
      TermsEnum.SeekStatus status = te.seekCeil(new BytesRef(s));
      if (status == TermsEnum.SeekStatus.END) {
        br = null;
      } else {
        br = te.term();
      }
    }
    assertEquals(br != null, rnum < size);
    if (rnum < size) {
      assertEquals(rnum, (int) te.ord());
      assertEquals(s, te.term().utf8ToString());
    }
  }
  // test seeking before term
  if (size > 0) {
    assertEquals(size > 0, te.seekCeil(new BytesRef("000")) != TermsEnum.SeekStatus.END);
    assertEquals(0, te.ord());
    assertEquals(t(0), te.term().utf8ToString());
  }
  if (size > 0) {
    // test seeking by term number
    for (int i = 0; i < size * 2 + 10; i++) {
      int rnum = r.nextInt(size);
      String s = t(rnum);
      te.seekExact((long) rnum);
      BytesRef br = te.term();
      assertNotNull(br);
      assertEquals(rnum, (int) te.ord());
      assertEquals(s, te.term().utf8ToString());
    }
  }
}
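seekCeil is the only seek in the loops above that can land between terms, so its three-way status drives the test's branching. A minimal standalone sketch of handling each SeekStatus, with a caller-supplied TermsEnum and a hypothetical target string:

import java.io.IOException;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class SeekCeilSketch {
  /** seekCeil positions the enum on the smallest term >= target and reports how. */
  static void seekAndReport(TermsEnum te, String target) throws IOException {
    TermsEnum.SeekStatus status = te.seekCeil(new BytesRef(target));
    switch (status) {
      case FOUND: // positioned exactly on target
        System.out.println("found " + te.term().utf8ToString() + " at ord " + te.ord());
        break;
      case NOT_FOUND: // positioned on the smallest term after target
        System.out.println("ceiling is " + te.term().utf8ToString());
        break;
      case END: // target sorts after every term; the enum is unpositioned
        System.out.println("no term >= " + target);
        break;
    }
  }
}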
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class TestTeeSinkTokenFilter, the method testEndOffsetPositionWithTeeSinkTokenFilter verifies token positions and offsets when the same field is indexed through both a TeeSinkTokenFilter and its sink.
// LUCENE-1448
// TODO: instead of testing it this way, we can test
// with BaseTokenStreamTestCase now...
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  TokenStream tokenStream = analyzer.tokenStream("field", "abcd   ");
  TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
  TokenStream sink = tee.newSinkTokenStream();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorOffsets(true);
  ft.setStoreTermVectorPositions(true);
  Field f1 = new Field("field", tee, ft);
  Field f2 = new Field("field", sink, ft);
  doc.add(f1);
  doc.add(f2);
  w.addDocument(doc);
  w.close();
  IndexReader r = DirectoryReader.open(dir);
  Terms vector = r.getTermVectors(0).terms("field");
  assertEquals(1, vector.size());
  TermsEnum termsEnum = vector.iterator();
  termsEnum.next();
  assertEquals(2, termsEnum.totalTermFreq());
  PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
  assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  assertEquals(2, positions.freq());
  positions.nextPosition();
  assertEquals(0, positions.startOffset());
  assertEquals(4, positions.endOffset());
  positions.nextPosition();
  assertEquals(8, positions.startOffset());
  assertEquals(12, positions.endOffset());
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
  r.close();
  dir.close();
  analyzer.close();
}
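The assertions above walk a stored term vector, which behaves like a single-document inverted index. A minimal sketch of the same walk for an arbitrary document and field (hypothetical names; the field must have been indexed with term vectors, positions, and offsets, as ft configures above):

import java.io.IOException;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermVectorSketch {
  /** Prints every position and offset stored in one document's term vector. */
  static void dumpTermVector(IndexReader reader, int docID, String field) throws IOException {
    Fields vectors = reader.getTermVectors(docID);
    if (vectors == null) return; // document has no term vectors
    Terms vector = vectors.terms(field);
    if (vector == null) return; // no vector stored for this field
    TermsEnum termsEnum = vector.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      PostingsEnum postings = termsEnum.postings(null, PostingsEnum.ALL);
      postings.nextDoc(); // a term vector holds exactly one "document"
      for (int i = 0; i < postings.freq(); i++) {
        int pos = postings.nextPosition();
        System.out.println(term.utf8ToString() + " pos=" + pos
            + " offsets=[" + postings.startOffset() + "," + postings.endOffset() + "]");
      }
    }
  }
}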