Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class TFValueSource, method getValues.
@Override
public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException {
Fields fields = readerContext.reader().fields();
final Terms terms = fields.terms(indexedField);
IndexSearcher searcher = (IndexSearcher) context.get("searcher");
final TFIDFSimilarity similarity = IDFValueSource.asTFIDF(searcher.getSimilarity(true), indexedField);
if (similarity == null) {
throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
}
return new FloatDocValues(this) {
PostingsEnum docs;
int atDoc;
int lastDocRequested = -1;
{
reset();
}
public void reset() throws IOException {
if (terms != null) {
final TermsEnum termsEnum = terms.iterator();
if (termsEnum.seekExact(indexedBytes)) {
docs = termsEnum.postings(null);
} else {
docs = null;
}
} else {
docs = null;
}
if (docs == null) {
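// term not present in this segment: substitute a placeholder enum that is already exhausted (NO_MORE_DOCS, freq 0)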
docs = new PostingsEnum() {
@Override
public int freq() {
return 0;
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
return null;
}
@Override
public int docID() {
return DocIdSetIterator.NO_MORE_DOCS;
}
@Override
public int nextDoc() {
return DocIdSetIterator.NO_MORE_DOCS;
}
@Override
public int advance(int target) {
return DocIdSetIterator.NO_MORE_DOCS;
}
@Override
public long cost() {
return 0;
}
};
}
atDoc = -1;
}
@Override
public float floatVal(int doc) {
try {
if (doc < lastDocRequested) {
// out-of-order access.... reset
reset();
}
lastDocRequested = doc;
if (atDoc < doc) {
atDoc = docs.advance(doc);
}
if (atDoc > doc) {
// no match for this document: the iterator is either exhausted or positioned past doc.
return similarity.tf(0);
}
// a match!
return similarity.tf(docs.freq());
} catch (IOException e) {
throw new RuntimeException("caught exception in function " + description() + " : doc=" + doc, e);
}
}
};
}
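For context, TFValueSource is the value source behind Solr's tf(field,term) function query. Below is a minimal, hypothetical sketch of driving it directly from Java; the field name, term, and the already-open IndexSearcher are assumptions, not part of the snippet above.
// Hypothetical usage sketch: field "text" and term "lucene" are placeholders.
BytesRef termBytes = new BytesRef("lucene");
ValueSource vs = new TFValueSource("text", "lucene", "text", termBytes);
Map context = ValueSource.newContext(searcher);      // stores the searcher under the "searcher" key
for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
  FunctionValues values = vs.getValues(context, leaf);
  float tf = values.floatVal(0);                     // similarity-scaled tf of local doc 0; tf(0) if the term is absent
}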
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class TermFreqValueSource, method getValues.
@Override
public FunctionValues getValues(Map context, LeafReaderContext readerContext) throws IOException {
Fields fields = readerContext.reader().fields();
final Terms terms = fields.terms(indexedField);
return new IntDocValues(this) {
PostingsEnum docs;
int atDoc;
int lastDocRequested = -1;
{
reset();
}
public void reset() throws IOException {
if (terms != null) {
final TermsEnum termsEnum = terms.iterator();
if (termsEnum.seekExact(indexedBytes)) {
docs = termsEnum.postings(null);
} else {
docs = null;
}
} else {
docs = null;
}
if (docs == null) {
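// term not present in this segment: substitute a placeholder enum that is already exhausted (NO_MORE_DOCS, freq 0)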
docs = new PostingsEnum() {
@Override
public int freq() {
return 0;
}
@Override
public int nextPosition() throws IOException {
return -1;
}
@Override
public int startOffset() throws IOException {
return -1;
}
@Override
public int endOffset() throws IOException {
return -1;
}
@Override
public BytesRef getPayload() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
return DocIdSetIterator.NO_MORE_DOCS;
}
@Override
public int nextDoc() {
return DocIdSetIterator.NO_MORE_DOCS;
}
@Override
public int advance(int target) {
return DocIdSetIterator.NO_MORE_DOCS;
}
@Override
public long cost() {
return 0;
}
};
}
atDoc = -1;
}
@Override
public int intVal(int doc) {
try {
if (doc < lastDocRequested) {
// out-of-order access.... reset
reset();
}
lastDocRequested = doc;
if (atDoc < doc) {
atDoc = docs.advance(doc);
}
if (atDoc > doc) {
// no match for this document: the iterator is either exhausted or positioned past doc.
return 0;
}
// a match!
return docs.freq();
} catch (IOException e) {
throw new RuntimeException("caught exception in function " + description() + " : doc=" + doc, e);
}
}
};
}
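TermFreqValueSource is the raw-count variant behind Solr's termfreq(field,term) function. A minimal, hypothetical sketch under the same assumptions as the TFValueSource example above (field, term, searcher, leaf context, and docId are placeholders):
ValueSource vs = new TermFreqValueSource("text", "lucene", "text", new BytesRef("lucene"));
FunctionValues fv = vs.getValues(ValueSource.newContext(searcher), leafContext);
int freq = fv.intVal(docId);   // raw term frequency for docId, 0 when the term does not occur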
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class TermVectorComponent (with its nested FieldOptions class), method mapOneVector.
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
NamedList<Object> fieldNL = new NamedList<>();
docNL.add(field, fieldNL);
BytesRef text;
PostingsEnum dpEnum = null;
while ((text = termsEnum.next()) != null) {
String term = text.utf8ToString();
NamedList<Object> termInfo = new NamedList<>();
fieldNL.add(term, termInfo);
final int freq = (int) termsEnum.totalTermFreq();
if (fieldOptions.termFreq) {
termInfo.add("tf", freq);
}
int dpEnumFlags = 0;
dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
//payloads require offsets
dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
boolean atNextDoc = false;
if (dpEnum != null) {
dpEnum.nextDoc();
atNextDoc = true;
}
if (atNextDoc && dpEnumFlags != 0) {
NamedList<Integer> positionsNL = null;
NamedList<Number> theOffsets = null;
NamedList<String> thePayloads = null;
for (int i = 0; i < freq; i++) {
final int pos = dpEnum.nextPosition();
if (fieldOptions.positions && pos >= 0) {
if (positionsNL == null) {
positionsNL = new NamedList<>();
termInfo.add("positions", positionsNL);
}
positionsNL.add("position", pos);
}
int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
if (startOffset >= 0) {
if (theOffsets == null) {
theOffsets = new NamedList<>();
termInfo.add("offsets", theOffsets);
}
theOffsets.add("start", dpEnum.startOffset());
theOffsets.add("end", dpEnum.endOffset());
}
BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
if (payload != null) {
if (thePayloads == null) {
thePayloads = new NamedList<>();
termInfo.add("payloads", thePayloads);
}
thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
}
}
}
int df = 0;
if (fieldOptions.docFreq || fieldOptions.tfIdf) {
df = reader.docFreq(new Term(field, text));
}
if (fieldOptions.docFreq) {
termInfo.add("df", df);
}
// TODO: this is not TF/IDF by anyone's definition!
if (fieldOptions.tfIdf) {
double tfIdfVal = ((double) freq) / df;
termInfo.add("tf-idf", tfIdfVal);
}
}
}
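mapOneVector walks a single document's term vector, requesting only the PostingsEnum features (positions, offsets, payloads) that the request asked for. The following is a stripped-down sketch of the same pattern, assuming a field that was indexed with term vectors including positions and offsets:
Terms vector = reader.getTermVector(docID, "text");        // null if no term vector was stored for this field
TermsEnum te = vector.iterator();
PostingsEnum dp = null;
BytesRef term;
while ((term = te.next()) != null) {
  dp = te.postings(dp, PostingsEnum.POSITIONS | PostingsEnum.OFFSETS);
  dp.nextDoc();                                            // a term vector holds exactly one "document"
  int freq = dp.freq();
  for (int i = 0; i < freq; i++) {
    int pos = dp.nextPosition();
    System.out.println(term.utf8ToString() + " pos=" + pos
        + " offsets=[" + dp.startOffset() + "," + dp.endOffset() + ")");
  }
}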
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class SimpleFacets, method getFacetTermEnumCounts.
/**
* Returns a list of terms in the specified field along with the
* corresponding count of documents in the set that match that constraint.
* This method uses the FilterCache to get the intersection count between <code>docs</code>
* and the DocSet for each term in the filter.
*
* @see FacetParams#FACET_LIMIT
* @see FacetParams#FACET_ZEROS
* @see FacetParams#FACET_MISSING
*/
public NamedList<Integer> getFacetTermEnumCounts(SolrIndexSearcher searcher, DocSet docs, String field, int offset, int limit, int mincount, boolean missing, String sort, String prefix, Predicate<BytesRef> termFilter, boolean intersectsCheck) throws IOException {
/* :TODO: potential optimization...
* cache the Terms with the highest docFreq and try them first
* don't enum if we get our max from them
*/
// Minimum term docFreq in order to use the filterCache for that term.
int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);
// make sure we have a set that is fast for random access, if we will use it for that
DocSet fastForRandomSet = docs;
if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
SortedIntDocSet sset = (SortedIntDocSet) docs;
fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
}
IndexSchema schema = searcher.getSchema();
FieldType ft = schema.getFieldType(field);
assert !ft.isPointField() : "Point Fields don't support enum method";
LeafReader r = searcher.getSlowAtomicReader();
boolean sortByCount = sort.equals("count") || sort.equals("true");
final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
final BoundedTreeSet<CountPair<BytesRef, Integer>> queue = sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
final NamedList<Integer> res = new NamedList<>();
// the smallest value in the top 'N' values
int min = mincount - 1;
int off = offset;
int lim = limit >= 0 ? limit : Integer.MAX_VALUE;
BytesRef prefixTermBytes = null;
if (prefix != null) {
String indexedPrefix = ft.toInternal(prefix);
prefixTermBytes = new BytesRef(indexedPrefix);
}
Fields fields = r.fields();
Terms terms = fields == null ? null : fields.terms(field);
TermsEnum termsEnum = null;
SolrIndexSearcher.DocsEnumState deState = null;
BytesRef term = null;
if (terms != null) {
termsEnum = terms.iterator();
if (prefixTermBytes != null) {
if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
termsEnum = null;
} else {
term = termsEnum.term();
}
} else {
// position termsEnum on first term
term = termsEnum.next();
}
}
PostingsEnum postingsEnum = null;
CharsRefBuilder charsRef = new CharsRefBuilder();
if (docs.size() >= mincount) {
while (term != null) {
if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes))
break;
if (termFilter == null || termFilter.test(term)) {
int df = termsEnum.docFreq();
// If sorting by count we can require df > min (rather than >=) because terms arrive in index order;
// for some term distributions this can make a large difference (for example, many terms with df=1).
if (df > 0 && df > min) {
int c;
if (df >= minDfFilterCache) {
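// high-df term: let the searcher compute the count, which can consult and populate the filterCache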
if (deState == null) {
deState = new SolrIndexSearcher.DocsEnumState();
deState.fieldName = field;
deState.liveDocs = r.getLiveDocs();
deState.termsEnum = termsEnum;
deState.postingsEnum = postingsEnum;
}
if (intersectsCheck) {
c = searcher.intersects(docs, deState) ? 1 : 0;
} else {
c = searcher.numDocs(docs, deState);
}
postingsEnum = deState.postingsEnum;
} else {
// iterate over TermDocs to calculate the intersection
// TODO: specialize when base docset is a bitset or hash set (skipDocs)? or does it matter for this?
// TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class impl)
// TODO: would passing deleted docs lead to better efficiency over checking the fastForRandomSet?
postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
c = 0;
if (postingsEnum instanceof MultiPostingsEnum) {
MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
SEGMENTS_LOOP: for (int subindex = 0; subindex < numSubs; subindex++) {
MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
if (sub.postingsEnum == null)
continue;
int base = sub.slice.start;
int docid;
while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.exists(docid + base)) {
c++;
if (intersectsCheck) {
assert c == 1;
break SEGMENTS_LOOP;
}
}
}
}
} else {
int docid;
while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
if (fastForRandomSet.exists(docid)) {
c++;
if (intersectsCheck) {
assert c == 1;
break;
}
}
}
}
}
if (sortByCount) {
if (c > min) {
BytesRef termCopy = BytesRef.deepCopyOf(term);
queue.add(new CountPair<>(termCopy, c));
if (queue.size() >= maxsize)
min = queue.last().val;
}
} else {
if (c >= mincount && --off < 0) {
if (--lim < 0)
break;
ft.indexedToReadable(term, charsRef);
res.add(charsRef.toString(), c);
}
}
}
}
term = termsEnum.next();
}
}
if (sortByCount) {
for (CountPair<BytesRef, Integer> p : queue) {
if (--off >= 0)
continue;
if (--lim < 0)
break;
ft.indexedToReadable(p.key, charsRef);
res.add(charsRef.toString(), p.val);
}
}
if (missing) {
res.add(null, getFieldMissingCount(searcher, docs, field));
}
return res;
}
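The heart of the enum-faceting loop above is: for each term, pull a docs-only PostingsEnum and count how many of its documents fall inside the base DocSet. A simplified, hypothetical sketch of that inner pattern (the open IndexReader, the field name, and the baseDocs set are placeholders; the filterCache and MultiPostingsEnum branches are omitted):
Terms terms = MultiFields.getTerms(reader, "category");
if (terms != null) {
  TermsEnum te = terms.iterator();
  PostingsEnum postings = null;
  BytesRef term;
  while ((term = te.next()) != null) {
    postings = te.postings(postings, PostingsEnum.NONE);   // doc ids only, no freqs/positions/offsets
    int count = 0;
    int docid;
    while ((docid = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      if (baseDocs.exists(docid)) {                        // baseDocs: the DocSet being faceted
        count++;
      }
    }
    // emit term.utf8ToString() -> count, subject to mincount/limit/offset handling
  }
}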
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
The class SolrIndexSearcher, method getDocSet.
/** @lucene.internal */
public DocSet getDocSet(DocsEnumState deState) throws IOException {
int largestPossible = deState.termsEnum.docFreq();
boolean useCache = filterCache != null && largestPossible >= deState.minSetSizeCached;
TermQuery key = null;
if (useCache) {
key = new TermQuery(new Term(deState.fieldName, deState.termsEnum.term()));
DocSet result = filterCache.get(key);
if (result != null)
return result;
}
int smallSetSize = DocSetUtil.smallSetSize(maxDoc());
int scratchSize = Math.min(smallSetSize, largestPossible);
if (deState.scratch == null || deState.scratch.length < scratchSize)
deState.scratch = new int[scratchSize];
final int[] docs = deState.scratch;
int upto = 0;
int bitsSet = 0;
FixedBitSet fbs = null;
PostingsEnum postingsEnum = deState.termsEnum.postings(deState.postingsEnum, PostingsEnum.NONE);
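// wrap the enum so documents deleted from the index (absent from liveDocs) are skipped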
postingsEnum = BitsFilteredPostingsEnum.wrap(postingsEnum, deState.liveDocs);
if (deState.postingsEnum == null) {
deState.postingsEnum = postingsEnum;
}
if (postingsEnum instanceof MultiPostingsEnum) {
MultiPostingsEnum.EnumWithSlice[] subs = ((MultiPostingsEnum) postingsEnum).getSubs();
int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
for (int subindex = 0; subindex < numSubs; subindex++) {
MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
if (sub.postingsEnum == null)
continue;
int base = sub.slice.start;
int docid;
if (largestPossible > docs.length) {
if (fbs == null)
fbs = new FixedBitSet(maxDoc());
while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
fbs.set(docid + base);
bitsSet++;
}
} else {
while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
docs[upto++] = docid + base;
}
}
}
} else {
int docid;
if (largestPossible > docs.length) {
fbs = new FixedBitSet(maxDoc());
while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
fbs.set(docid);
bitsSet++;
}
} else {
while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
docs[upto++] = docid;
}
}
}
DocSet result;
if (fbs != null) {
for (int i = 0; i < upto; i++) {
fbs.set(docs[i]);
}
bitsSet += upto;
result = new BitDocSet(fbs, bitsSet);
} else {
result = upto == 0 ? DocSet.EMPTY : new SortedIntDocSet(Arrays.copyOf(docs, upto));
}
if (useCache) {
filterCache.put(key, result);
}
return result;
}
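The method above materializes one term's postings into a DocSet, choosing the representation by expected size: a sorted int[] when the term's docFreq fits in the scratch array, otherwise a FixedBitSet sized to maxDoc. A condensed sketch of that decision; the scratch array, maxDoc value, and surrounding setup are assumptions:
int largestPossible = termsEnum.docFreq();
PostingsEnum pe = termsEnum.postings(null, PostingsEnum.NONE);
DocSet result;
if (largestPossible <= scratch.length) {          // small result: collect doc ids into an int[]
  int upto = 0, docid;
  while ((docid = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    scratch[upto++] = docid;
  }
  result = upto == 0 ? DocSet.EMPTY : new SortedIntDocSet(Arrays.copyOf(scratch, upto));
} else {                                          // large result: flip bits in a FixedBitSet
  FixedBitSet bits = new FixedBitSet(maxDoc);
  int cardinality = 0;
  int docid;
  while ((docid = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    bits.set(docid);
    cardinality++;
  }
  result = new BitDocSet(bits, cardinality);
}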