Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
In the class TestRTGBase, the method getFirstMatch:
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Terms terms = MultiFields.getTerms(r, t.field());
  if (terms == null)
    return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator();
  if (!termsEnum.seekExact(termBytes)) {
    return -1;
  }
  // iterate only the live (non-deleted) documents for this term
  PostingsEnum docs = termsEnum.postings(null, PostingsEnum.NONE);
  docs = BitsFilteredPostingsEnum.wrap(docs, MultiFields.getLiveDocs(r));
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    // a unique-key term must match at most one live document
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
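For context, here is a minimal standalone sketch of the same seek-then-iterate lookup pattern, assuming an open IndexReader and the Lucene 6.x-era MultiFields API used throughout this page; the class name, field name, and term value below are hypothetical:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class TermDocLookup {
  /** Returns the first doc id indexed with the given term, or -1 if absent. */
  static int firstDocFor(IndexReader reader, String field, String value) throws IOException {
    Terms terms = MultiFields.getTerms(reader, field);
    if (terms == null) {
      return -1; // field is not indexed in this reader
    }
    TermsEnum termsEnum = terms.iterator();
    if (!termsEnum.seekExact(new BytesRef(value))) {
      return -1; // term does not exist
    }
    // NONE: we only need doc ids, not freqs, positions, or payloads
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.NONE);
    int doc = postings.nextDoc();
    return doc == DocIdSetIterator.NO_MORE_DOCS ? -1 : doc;
  }
}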
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
In the class DirectoryTaxonomyReader, the method getOrdinal:
@Override
public int getOrdinal(FacetLabel cp) throws IOException {
  ensureOpen();
  if (cp.length == 0) {
    return ROOT_ORDINAL;
  }
  // First try to find the answer in the LRU cache:
  synchronized (ordinalCache) {
    Integer res = ordinalCache.get(cp);
    if (res != null) {
      if (res.intValue() < indexReader.maxDoc()) {
        // The cache is shared with DTR instances allocated from
        // doOpenIfChanged, so only return the cached ordinal if it is one
        // that this DTR instance recognizes.
        return res.intValue();
      } else {
        // The category was found in the cache but is not visible to this
        // DTR instance, so there is no need to search for the path on
        // disk; we won't find it there either.
        return TaxonomyReader.INVALID_ORDINAL;
      }
    }
  }
  // If we're still here, we have a cache miss. We need to fetch the
  // value from disk, and then also put it in the cache:
  int ret = TaxonomyReader.INVALID_ORDINAL;
  PostingsEnum docs = MultiFields.getTermDocsEnum(indexReader, Consts.FULL, new BytesRef(FacetsConfig.pathToString(cp.components, cp.length)), 0);
  if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    ret = docs.docID();
    // Only cache the fact that a category exists, never its absence:
    // the cache is shared with new DTR instances, and caching absence
    // could wrongly tell a newer generation of DTR that a category does
    // not exist.
    synchronized (ordinalCache) {
      ordinalCache.put(cp, Integer.valueOf(ret));
    }
  }
  return ret;
}
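The four-argument MultiFields.getTermDocsEnum call above is a one-call shortcut for the seek-then-postings dance in the previous example. A minimal sketch of the same lookup outside the taxonomy code, assuming an open IndexReader named reader; the field name and term value are hypothetical, and the flags value 0 requests doc ids only, matching what getOrdinal passes:

PostingsEnum docs = MultiFields.getTermDocsEnum(reader, "id", new BytesRef("doc-42"), 0);
int docId = -1;
if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
  docId = docs.docID(); // first (and, for a primary key, only) matching document
}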
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
In the class TaxonomyIndexArrays, the method initParents:
// Read the parents of the new categories
private void initParents(IndexReader reader, int first) throws IOException {
  if (reader.maxDoc() == first) {
    return;
  }
  // It's OK to use MultiFields here because we only iterate over one
  // posting list; breaking it up to loop over leaves() only complicates
  // the code for no apparent gain.
  PostingsEnum positions = MultiFields.getTermPositionsEnum(reader, Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, PostingsEnum.PAYLOADS);
  // shouldn't really happen; if it does, something's wrong
  if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
    throw new CorruptIndexException("Missing parent data for category " + first, reader.toString());
  }
  int num = reader.maxDoc();
  for (int i = first; i < num; i++) {
    if (positions.docID() == i) {
      if (positions.freq() == 0) {
        // shouldn't happen
        throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
      }
      // the parent ordinal is encoded as the position of the payload term
      parents[i] = positions.nextPosition();
      if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
        if (i + 1 < num) {
          throw new CorruptIndexException("Missing parent data for category " + (i + 1), reader.toString());
        }
        break;
      }
    } else {
      // this shouldn't happen
      throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
    }
  }
}
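initParents recovers each category's parent ordinal from the position of a dedicated term. A minimal sketch of walking a positions-enabled PostingsEnum in the same style, assuming an open IndexReader named reader; the field name and term value are hypothetical:

PostingsEnum positions = MultiFields.getTermPositionsEnum(reader, "body", new BytesRef("lucene"), PostingsEnum.POSITIONS);
if (positions != null) {
  for (int doc = positions.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = positions.nextDoc()) {
    int freq = positions.freq(); // number of positions recorded for this document
    for (int i = 0; i < freq; i++) {
      int pos = positions.nextPosition(); // consume each position in order
    }
  }
}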
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
In the class DirectoryTaxonomyWriter, the method addTaxonomy:
/**
 * Takes the categories from the given taxonomy directory, and adds the
 * missing ones to this taxonomy. Additionally, it fills the given
 * {@link OrdinalMap} with a mapping from the original ordinal to the new
 * ordinal.
 */
public void addTaxonomy(Directory taxoDir, OrdinalMap map) throws IOException {
  ensureOpen();
  DirectoryReader r = DirectoryReader.open(taxoDir);
  try {
    final int size = r.numDocs();
    final OrdinalMap ordinalMap = map;
    ordinalMap.setSize(size);
    int base = 0;
    PostingsEnum docs = null;
    for (final LeafReaderContext ctx : r.leaves()) {
      final LeafReader ar = ctx.reader();
      final Terms terms = ar.terms(Consts.FULL);
      // TODO: share per-segment TermsEnum here!
      TermsEnum te = terms.iterator();
      while (te.next() != null) {
        FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
        final int ordinal = addCategory(cp);
        docs = te.postings(docs, PostingsEnum.NONE);
        ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
      }
      // no deletions, so we're ok
      base += ar.maxDoc();
    }
    ordinalMap.addDone();
  } finally {
    r.close();
  }
}
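Note how the loop passes the previous PostingsEnum back into te.postings(docs, ...) so Lucene can recycle it rather than allocate a new enum per term. A minimal sketch of that reuse pattern in isolation, assuming a LeafReader named leafReader; the field name is hypothetical:

Terms terms = leafReader.terms("category");
if (terms != null) {
  TermsEnum te = terms.iterator();
  PostingsEnum docs = null; // reused across all terms of the field
  while (te.next() != null) {
    docs = te.postings(docs, PostingsEnum.NONE);
    int firstDoc = docs.nextDoc();
    // ... use te.term() and firstDoc ...
  }
}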
Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.
In the class SolrIndexSplitter, the method split:
FixedBitSet[] split(LeafReaderContext readerContext) throws IOException {
  LeafReader reader = readerContext.reader();
  FixedBitSet[] docSets = new FixedBitSet[numPieces];
  for (int i = 0; i < docSets.length; i++) {
    docSets[i] = new FixedBitSet(reader.maxDoc());
  }
  Bits liveDocs = reader.getLiveDocs();
  Fields fields = reader.fields();
  Terms terms = fields == null ? null : fields.terms(field.getName());
  TermsEnum termsEnum = terms == null ? null : terms.iterator();
  if (termsEnum == null)
    return docSets;
  BytesRef term = null;
  PostingsEnum postingsEnum = null;
  int[] docsMatchingRanges = null;
  if (ranges != null) {
    // +1 because documents can belong to *zero*, one, several, or all ranges in rangesArr
    docsMatchingRanges = new int[rangesArr.length + 1];
  }
  CharsRefBuilder idRef = new CharsRefBuilder();
  for (;;) {
    term = termsEnum.next();
    if (term == null)
      break;
    // figure out the hash for the term
    // FUTURE: if conversion to strings costs too much, we could
    // specialize and use the hash function that can work over bytes.
    field.getType().indexedToReadable(term, idRef);
    String idString = idRef.toString();
    if (splitKey != null) {
      // TODO: have composite routers support this kind of thing instead
      String part1 = getRouteKey(idString);
      if (part1 == null)
        continue;
      if (!splitKey.equals(part1)) {
        continue;
      }
    }
    int hash = 0;
    if (hashRouter != null) {
      hash = hashRouter.sliceHash(idString, null, null, null);
    }
    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
    postingsEnum = BitsFilteredPostingsEnum.wrap(postingsEnum, liveDocs);
    for (;;) {
      int doc = postingsEnum.nextDoc();
      if (doc == DocIdSetIterator.NO_MORE_DOCS)
        break;
      if (ranges == null) {
        docSets[currPartition].set(doc);
        currPartition = (currPartition + 1) % numPieces;
      } else {
        int matchingRangesCount = 0;
        for (int i = 0; i < rangesArr.length; i++) {
          // inner loop: use the array here for extra speed
          if (rangesArr[i].includes(hash)) {
            docSets[i].set(doc);
            ++matchingRangesCount;
          }
        }
        docsMatchingRanges[matchingRangesCount]++;
      }
    }
  }
  if (docsMatchingRanges != null) {
    for (int ii = 0; ii < docsMatchingRanges.length; ii++) {
      if (0 == docsMatchingRanges[ii])
        continue;
      switch (ii) {
        case 0:
          // document loss
          log.error("Splitting {}: {} documents belong to no shards and will be dropped", reader, docsMatchingRanges[ii]);
          break;
        case 1:
          // normal case, each document moves to one of the sub-shards
          log.info("Splitting {}: {} documents will move into a sub-shard", reader, docsMatchingRanges[ii]);
          break;
        default:
          // document duplication
          log.error("Splitting {}: {} documents will be moved to multiple ({}) sub-shards", reader, docsMatchingRanges[ii], ii);
          break;
      }
    }
  }
  return docSets;
}
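split() relies on Solr's BitsFilteredPostingsEnum to hide deleted documents. A minimal sketch of the equivalent manual live-docs check, assuming reader and postingsEnum are set up as in split() above:

Bits liveDocs = reader.getLiveDocs(); // null when the segment has no deletions
for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
  if (liveDocs != null && !liveDocs.get(doc)) {
    continue; // skip deleted documents
  }
  // ... route the live document to its sub-shard bitset ...
}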