Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
Class LukeRequestHandler, method getDetailedFieldInfo.
// Get terribly detailed information about a particular field. This is a very expensive call, use it with caution
// especially on large indexes!
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap) throws IOException {
  SolrParams params = req.getParams();
  final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);
  // Something to collect the top N terms in.
  TopTermQueue tiq = new TopTermQueue(numTerms + 1);
  final CharsRefBuilder spare = new CharsRefBuilder();
  Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(), field);
  if (terms == null) {
    // field does not exist
    return;
  }
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  int[] buckets = new int[HIST_ARRAY_SIZE];
  while ((text = termsEnum.next()) != null) {
    ++tiq.distinctTerms;
    // This calculation seems odd, but it gives the same results as it used to.
    int freq = termsEnum.docFreq();
    int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
    buckets[slot] = buckets[slot] + 1;
    if (numTerms > 0 && freq > tiq.minFreq) {
      spare.copyUTF8Bytes(text);
      String t = spare.toString();
      tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
      if (tiq.size() > numTerms) { // if tiq full
        tiq.pop(); // remove lowest in tiq
        tiq.minFreq = tiq.getTopTermInfo().docFreq;
      }
    }
  }
  tiq.histogram.add(buckets);
  fieldMap.add("distinct", tiq.distinctTerms);
  // Include top terms
  fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));
  // Add a histogram
  fieldMap.add("histogram", tiq.histogram.toNamedList());
}
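The "odd" slot computation is a base-2 histogram bucket: for a docFreq of 1 it yields 0, and otherwise it equals ceil(log2(docFreq)), so frequencies 1, 2, 3-4, 5-8, 9-16, ... count into buckets 0, 1, 2, 3, 4, ... Below is a minimal sketch of just that calculation; the helper name is invented and is not part of LukeRequestHandler.

// Hypothetical helper (name invented) showing where a given docFreq lands:
// docFreq=1 -> bucket 0, 2 -> 1, 3..4 -> 2, 5..8 -> 3, 9..16 -> 4, ... i.e. ceil(log2(docFreq)).
private static int histogramBucket(int docFreq) {
  // numberOfLeadingZeros(0) == 32, so docFreq <= 1 maps to bucket 0
  return 32 - Integer.numberOfLeadingZeros(Math.max(0, docFreq - 1));
}

Since docFreq is an int, the slot can never exceed 31, which is why a small fixed-size buckets array suffices regardless of index size.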
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
Class LukeRequestHandler, method getFirstLiveDoc.
// Just get a document with the term in it, the first one will do!
// Is there a better way to do this? Shouldn't actually be very costly
// to do it this way.
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  PostingsEnum postingsEnum = null;
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  // Deal with the chance that the first bunch of terms are in deleted documents. Is there a better way?
  for (int idx = 0; idx < 1000 && postingsEnum == null; ++idx) {
    text = termsEnum.next();
    if (text == null) {
      // Ran off the end of the terms enum without finding any live docs with that field in them.
      return null;
    }
    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
    final Bits liveDocs = reader.getLiveDocs();
    if (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      // getLiveDocs() bits are *set* for live documents (null means the segment has no deletions),
      // so only move on to the next term when this first posting points at a deleted document.
      if (liveDocs != null && liveDocs.get(postingsEnum.docID()) == false) {
        continue;
      }
      return reader.document(postingsEnum.docID());
    }
  }
  return null;
}
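For reference, here is the same live-docs idea as a small sketch that walks every posting of a single term instead of probing only the first posting of up to 1000 terms. The method name is invented and this is not Solr's code; it relies only on two documented facts: getLiveDocs() returns null when the segment has no deletions, and otherwise the returned Bits are set for live documents.

// Hypothetical helper (name invented): first live document containing the first term of `field`, or null.
private static Document firstLiveDocForFirstTerm(LeafReader reader, String field) throws IOException {
  Terms terms = reader.terms(field);
  if (terms == null) {
    return null; // field is not indexed in this segment
  }
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.next() == null) {
    return null; // field has no terms
  }
  PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
  final Bits liveDocs = reader.getLiveDocs(); // null means the segment has no deletions
  for (int docId = postingsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = postingsEnum.nextDoc()) {
    if (liveDocs == null || liveDocs.get(docId)) { // bit set means the document is live
      return reader.document(docId);
    }
  }
  return null; // every document containing this term has been deleted
}

Either approach works; the handler above spreads its probes across terms because any live document containing the field will do.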
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
Class TestFSTs, method testPrimaryKeys.
public void testPrimaryKeys() throws Exception {
  Directory dir = newDirectory();
  for (int cycle = 0; cycle < 2; cycle++) {
    if (VERBOSE) {
      System.out.println("TEST: cycle=" + cycle);
    }
    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    Document doc = new Document();
    Field idField = newStringField("id", "", Field.Store.NO);
    doc.add(idField);
    final int NUM_IDS = atLeast(200);
    //final int NUM_IDS = (int) (377 * (1.0+random.nextDouble()));
    if (VERBOSE) {
      System.out.println("TEST: NUM_IDS=" + NUM_IDS);
    }
    final Set<String> allIDs = new HashSet<>();
    for (int id = 0; id < NUM_IDS; id++) {
      String idString;
      if (cycle == 0) {
        // PKs are assigned sequentially
        idString = String.format(Locale.ROOT, "%07d", id);
      } else {
        while (true) {
          final String s = Long.toString(random().nextLong());
          if (!allIDs.contains(s)) {
            idString = s;
            break;
          }
        }
      }
      allIDs.add(idString);
      idField.setStringValue(idString);
      w.addDocument(doc);
    }
    //w.forceMerge(1);
    // turn writer into reader:
    final IndexReader r = w.getReader();
    final IndexSearcher s = newSearcher(r);
    w.close();
    final List<String> allIDsList = new ArrayList<>(allIDs);
    final List<String> sortedAllIDsList = new ArrayList<>(allIDsList);
    Collections.sort(sortedAllIDsList);
    // Sprinkle in some non-existent PKs:
    Set<String> outOfBounds = new HashSet<>();
    for (int idx = 0; idx < NUM_IDS / 10; idx++) {
      String idString;
      if (cycle == 0) {
        idString = String.format(Locale.ROOT, "%07d", (NUM_IDS + idx));
      } else {
        while (true) {
          idString = Long.toString(random().nextLong());
          if (!allIDs.contains(idString)) {
            break;
          }
        }
      }
      outOfBounds.add(idString);
      allIDsList.add(idString);
    }
    // Verify w/ TermQuery
    for (int iter = 0; iter < 2 * NUM_IDS; iter++) {
      final String id = allIDsList.get(random().nextInt(allIDsList.size()));
      final boolean exists = !outOfBounds.contains(id);
      if (VERBOSE) {
        System.out.println("TEST: TermQuery " + (exists ? "" : "non-exist ") + " id=" + id);
      }
      assertEquals((exists ? "" : "non-exist ") + "id=" + id, exists ? 1 : 0,
          s.search(new TermQuery(new Term("id", id)), 1).totalHits);
    }
    // Verify w/ MultiTermsEnum
    final TermsEnum termsEnum = MultiFields.getTerms(r, "id").iterator();
    for (int iter = 0; iter < 2 * NUM_IDS; iter++) {
      final String id;
      final String nextID;
      final boolean exists;
      if (random().nextBoolean()) {
        id = allIDsList.get(random().nextInt(allIDsList.size()));
        exists = !outOfBounds.contains(id);
        nextID = null;
        if (VERBOSE) {
          System.out.println("TEST: exactOnly " + (exists ? "" : "non-exist ") + "id=" + id);
        }
      } else {
        // Pick ID between two IDs:
        exists = false;
        final int idv = random().nextInt(NUM_IDS - 1);
        if (cycle == 0) {
          id = String.format(Locale.ROOT, "%07da", idv);
          nextID = String.format(Locale.ROOT, "%07d", idv + 1);
        } else {
          id = sortedAllIDsList.get(idv) + "a";
          nextID = sortedAllIDsList.get(idv + 1);
        }
        if (VERBOSE) {
          System.out.println("TEST: not exactOnly id=" + id + " nextID=" + nextID);
        }
      }
      final TermsEnum.SeekStatus status;
      if (nextID == null) {
        if (termsEnum.seekExact(new BytesRef(id))) {
          status = TermsEnum.SeekStatus.FOUND;
        } else {
          status = TermsEnum.SeekStatus.NOT_FOUND;
        }
      } else {
        status = termsEnum.seekCeil(new BytesRef(id));
      }
      if (nextID != null) {
        assertEquals(TermsEnum.SeekStatus.NOT_FOUND, status);
        assertEquals("expected=" + nextID + " actual=" + termsEnum.term().utf8ToString(),
            new BytesRef(nextID), termsEnum.term());
      } else if (!exists) {
        assertTrue(status == TermsEnum.SeekStatus.NOT_FOUND || status == TermsEnum.SeekStatus.END);
      } else {
        assertEquals(TermsEnum.SeekStatus.FOUND, status);
      }
    }
    r.close();
  }
  dir.close();
}
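The assertions above hinge on the difference between the two seek methods: seekExact only answers whether the exact term is present, while seekCeil positions the enum on the smallest term greater than or equal to the target and reports FOUND, NOT_FOUND, or END. A hedged sketch of that contract follows; describeSeek is a made-up name and not part of the test.

// Hypothetical helper (name invented): summarizes the seek contract the assertions above rely on.
private static String describeSeek(TermsEnum termsEnum, String id) throws IOException {
  if (termsEnum.seekExact(new BytesRef(id))) {
    return id + " exists"; // the enum is now positioned exactly on id
  }
  TermsEnum.SeekStatus status = termsEnum.seekCeil(new BytesRef(id));
  if (status == TermsEnum.SeekStatus.NOT_FOUND) {
    // the enum is positioned on the smallest term greater than id
    return id + " missing; next term is " + termsEnum.term().utf8ToString();
  }
  // SeekStatus.END: id sorts after every indexed term and the enum is exhausted
  return id + " sorts past the last indexed term";
}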
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
Class TestFSTs, method testRealTerms.
// Build FST for all unique terms in the test line docs
// file, up until a doc limit
public void testRealTerms() throws Exception {
  final LineFileDocs docs = new LineFileDocs(random());
  final int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
  final Path tempDir = createTempDir("fstlines");
  final Directory dir = newFSDirectory(tempDir);
  final IndexWriter writer = new IndexWriter(dir, conf);
  Document doc;
  int docCount = 0;
  while ((doc = docs.nextDoc()) != null && docCount < numDocs) {
    writer.addDocument(doc);
    docCount++;
  }
  IndexReader r = DirectoryReader.open(writer);
  writer.close();
  final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
  boolean storeOrd = random().nextBoolean();
  if (VERBOSE) {
    if (storeOrd) {
      System.out.println("FST stores ord");
    } else {
      System.out.println("FST stores docFreq");
    }
  }
  Terms terms = MultiFields.getTerms(r, "body");
  if (terms != null) {
    final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
    final TermsEnum termsEnum = terms.iterator();
    if (VERBOSE) {
      System.out.println("TEST: got termsEnum=" + termsEnum);
    }
    BytesRef term;
    int ord = 0;
    Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
    final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null);
    while ((term = termsEnum.next()) != null) {
      BytesRef term2 = termsEnum2.next();
      assertNotNull(term2);
      assertEquals(term, term2);
      assertEquals(termsEnum.docFreq(), termsEnum2.docFreq());
      assertEquals(termsEnum.totalTermFreq(), termsEnum2.totalTermFreq());
      if (ord == 0) {
        try {
          termsEnum.ord();
        } catch (UnsupportedOperationException uoe) {
          if (VERBOSE) {
            System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
          }
          storeOrd = false;
        }
      }
      final int output;
      if (storeOrd) {
        output = ord;
      } else {
        output = termsEnum.docFreq();
      }
      builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
      ord++;
      if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
        System.out.println(ord + " terms...");
      }
    }
    FST<Long> fst = builder.finish();
    if (VERBOSE) {
      System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; "
          + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
    }
    if (ord > 0) {
      final Random random = new Random(random().nextLong());
      // Now confirm BytesRefFSTEnum and TermsEnum act the
      // same:
      final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
      int num = atLeast(1000);
      for (int iter = 0; iter < num; iter++) {
        final BytesRef randomTerm = new BytesRef(getRandomString(random));
        if (VERBOSE) {
          System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
        }
        final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
        final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
        if (seekResult == TermsEnum.SeekStatus.END) {
          assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
        } else {
          assertSame(termsEnum, fstEnum, storeOrd);
          for (int nextIter = 0; nextIter < 10; nextIter++) {
            if (VERBOSE) {
              System.out.println("TEST: next");
              if (storeOrd) {
                System.out.println(" ord=" + termsEnum.ord());
              }
            }
            if (termsEnum.next() != null) {
              if (VERBOSE) {
                System.out.println(" term=" + termsEnum.term().utf8ToString());
              }
              assertNotNull(fstEnum.next());
              assertSame(termsEnum, fstEnum, storeOrd);
            } else {
              if (VERBOSE) {
                System.out.println(" end!");
              }
              BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
              if (nextResult != null) {
                System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
                fail();
              }
              break;
            }
          }
        }
      }
    }
  }
  r.close();
  dir.close();
}
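The loop above maps every term of the body field to either its ord or its docFreq and then cross-checks the finished FST against the live TermsEnum. The sketch below shows the same Builder/PositiveIntOutputs pattern on three hard-coded terms; the method name and sample data are invented, and inputs must arrive in sorted order, which a real TermsEnum already guarantees.

// Hypothetical, much smaller version of what the test builds (name and data invented):
// map each term to a long value and read it back through the finished FST.
private static void buildTinyTermFst() throws IOException {
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  IntsRefBuilder scratch = new IntsRefBuilder();
  String[] sortedTerms = { "apache", "lucene", "solr" }; // must be added in sorted order
  long[] values = { 3, 7, 5 };                           // e.g. a docFreq per term
  for (int i = 0; i < sortedTerms.length; i++) {
    builder.add(Util.toIntsRef(new BytesRef(sortedTerms[i]), scratch), values[i]);
  }
  FST<Long> fst = builder.finish();
  System.out.println(Util.get(fst, new BytesRef("lucene")));  // prints 7
  System.out.println(Util.get(fst, new BytesRef("missing"))); // prints null
}

In the test itself the per-term output is the ord when the codec supports it, otherwise the docFreq, exactly as in the storeOrd branch above.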
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
Class TermsIncludingScoreQuery, method createWeight.
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
  if (needsScores == false) {
    // We don't need scores then quickly change the query:
    TermsQuery termsQuery = new TermsQuery(toField, terms, fromField, fromQuery, topReaderContextId);
    return searcher.rewrite(termsQuery).createWeight(searcher, false, boost);
  }
  return new Weight(TermsIncludingScoreQuery.this) {

    @Override
    public void extractTerms(Set<Term> terms) {
    }

    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      Terms terms = context.reader().terms(toField);
      if (terms != null) {
        TermsEnum segmentTermsEnum = terms.iterator();
        BytesRef spare = new BytesRef();
        PostingsEnum postingsEnum = null;
        for (int i = 0; i < TermsIncludingScoreQuery.this.terms.size(); i++) {
          if (segmentTermsEnum.seekExact(TermsIncludingScoreQuery.this.terms.get(ords[i], spare))) {
            postingsEnum = segmentTermsEnum.postings(postingsEnum, PostingsEnum.NONE);
            if (postingsEnum.advance(doc) == doc) {
              final float score = TermsIncludingScoreQuery.this.scores[ords[i]];
              return Explanation.match(score, "Score based on join value " + segmentTermsEnum.term().utf8ToString());
            }
          }
        }
      }
      return Explanation.noMatch("Not a match");
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      Terms terms = context.reader().terms(toField);
      if (terms == null) {
        return null;
      }
      // what is the runtime...seems ok?
      final long cost = context.reader().maxDoc() * terms.size();
      TermsEnum segmentTermsEnum = terms.iterator();
      if (multipleValuesPerDocument) {
        return new MVInOrderScorer(this, segmentTermsEnum, context.reader().maxDoc(), cost);
      } else {
        return new SVInOrderScorer(this, segmentTermsEnum, context.reader().maxDoc(), cost);
      }
    }
  };
}
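The explain() method above uses a common TermsEnum idiom: seekExact to land on a join term, reuse the PostingsEnum across terms, then advance(doc) to test whether the document being explained contains that term. A minimal standalone sketch of the idiom follows; docContainsTerm is a hypothetical helper, not part of TermsIncludingScoreQuery.

// Hypothetical helper (name invented): does document `doc` in this segment contain `term` in `field`?
private static boolean docContainsTerm(LeafReader reader, String field, BytesRef term, int doc) throws IOException {
  Terms terms = reader.terms(field);
  if (terms == null) {
    return false; // field is not indexed in this segment
  }
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.seekExact(term) == false) {
    return false; // term does not occur in this segment's dictionary
  }
  PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
  return postingsEnum.advance(doc) == doc; // advance() lands on the first doc id >= doc
}

Because advance() only moves forward, this check is cheap when invoked once per candidate document, but a fresh PostingsEnum (or a re-seek) is needed if documents are tested out of order.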