Search in sources :

Example 21 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From the class TestRTGBase, the method getFirstMatch:

/**
 * Returns the docID of the single live document matching {@code t}, or -1
 * when the field or term is absent (or the only match is deleted).
 * Asserts that at most one live document matches the term.
 */
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
    // Missing field => no possible match.
    final Terms terms = MultiFields.getTerms(r, t.field());
    if (terms == null) {
        return -1;
    }
    final TermsEnum te = terms.iterator();
    if (!te.seekExact(t.bytes())) {
        // Term is not indexed at all.
        return -1;
    }
    // Iterate only live documents for this term.
    PostingsEnum postings = te.postings(null, PostingsEnum.NONE);
    postings = BitsFilteredPostingsEnum.wrap(postings, MultiFields.getLiveDocs(r));
    final int first = postings.nextDoc();
    if (first == DocIdSetIterator.NO_MORE_DOCS) {
        return -1;
    }
    // Uniqueness check: there must be no second live match.
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, postings.nextDoc());
    return first;
}
Also used : Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 22 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From the class DirectoryTaxonomyReader, the method getOrdinal:

@Override
public int getOrdinal(FacetLabel cp) throws IOException {
    ensureOpen();
    // The empty category path is, by definition, the root ordinal.
    if (cp.length == 0) {
        return ROOT_ORDINAL;
    }
    // Fast path: consult the LRU cache under its lock.
    synchronized (ordinalCache) {
        final Integer cached = ordinalCache.get(cp);
        if (cached != null) {
            // An ordinal at or beyond maxDoc() was cached by a newer taxonomy
            // generation than this reader can see; report it as unknown.
            return cached.intValue() < indexReader.maxDoc()
                ? cached.intValue()
                : TaxonomyReader.INVALID_ORDINAL;
        }
    }
    // Cache miss: resolve the category on disk, then populate the cache.
    final PostingsEnum docs = MultiFields.getTermDocsEnum(indexReader, Consts.FULL, new BytesRef(FacetsConfig.pathToString(cp.components, cp.length)), 0);
    int ordinal = TaxonomyReader.INVALID_ORDINAL;
    if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        ordinal = docs.docID();
        // Only successful lookups are cached (misses are re-resolved next time).
        synchronized (ordinalCache) {
            ordinalCache.put(cp, Integer.valueOf(ordinal));
        }
    }
    return ordinal;
}
Also used : PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef)

Example 23 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From the class TaxonomyIndexArrays, the method initParents:

// Read the parents of the new categories
/**
 * Fills {@code parents[i]} for every category docID in
 * {@code [first, reader.maxDoc())} from the parent-payload posting list.
 * Each category document is expected to appear exactly once, in strict
 * docID order; any gap or missing position data is treated as corruption.
 *
 * @param reader taxonomy index reader supplying the parent data
 * @param first  first (new) docID whose parent must be resolved
 * @throws CorruptIndexException if any category in range lacks parent data
 * @throws IOException on low-level index access errors
 */
private void initParents(IndexReader reader, int first) throws IOException {
    // Nothing new past 'first' — all parents are already known.
    if (reader.maxDoc() == first) {
        return;
    }
    // it's ok to use MultiFields because we only iterate on one posting list.
    // breaking it to loop over the leaves() only complicates code for no
    // apparent gain.
    PostingsEnum positions = MultiFields.getTermPositionsEnum(reader, Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, PostingsEnum.PAYLOADS);
    // shouldn't really happen, if it does, something's wrong
    if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
        throw new CorruptIndexException("Missing parent data for category " + first, reader.toString());
    }
    int num = reader.maxDoc();
    for (int i = first; i < num; i++) {
        // The enum must track i in lockstep: every docID in range is a category.
        if (positions.docID() == i) {
            if (positions.freq() == 0) {
                // shouldn't happen
                throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
            }
            // The first position of each doc encodes its parent ordinal.
            parents[i] = positions.nextPosition();
            if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                // Exhausted postings: only legal if i was the last document.
                if (i + 1 < num) {
                    throw new CorruptIndexException("Missing parent data for category " + (i + 1), reader.toString());
                }
                break;
            }
        } else {
            // this shouldn't happen
            throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
        }
    }
}
Also used : CorruptIndexException(org.apache.lucene.index.CorruptIndexException) PostingsEnum(org.apache.lucene.index.PostingsEnum)

Example 24 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From the class DirectoryTaxonomyWriter, the method addTaxonomy:

/**
   * Takes the categories from the given taxonomy directory, and adds the
   * missing ones to this taxonomy. Additionally, it fills the given
   * {@link OrdinalMap} with a mapping from the original ordinal to the new
   * ordinal.
   *
   * @param taxoDir directory holding the source taxonomy index
   * @param map receives the original-ordinal to new-ordinal mapping
   * @throws IOException on low-level index access errors
   */
public void addTaxonomy(Directory taxoDir, OrdinalMap map) throws IOException {
    ensureOpen();
    // try-with-resources guarantees the reader is closed even on failure.
    try (DirectoryReader r = DirectoryReader.open(taxoDir)) {
        final int size = r.numDocs();
        final OrdinalMap ordinalMap = map;
        ordinalMap.setSize(size);
        int base = 0;
        PostingsEnum docs = null;
        for (final LeafReaderContext ctx : r.leaves()) {
            final LeafReader ar = ctx.reader();
            final Terms terms = ar.terms(Consts.FULL);
            // A segment may lack the category field entirely (terms == null);
            // skip its term loop but still advance base below so docID
            // offsets for later segments stay aligned.
            if (terms != null) {
                // TODO: share per-segment TermsEnum here!
                TermsEnum te = terms.iterator();
                while (te.next() != null) {
                    FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
                    final int ordinal = addCategory(cp);
                    // Each category term matches exactly one document; reuse the enum.
                    docs = te.postings(docs, PostingsEnum.NONE);
                    ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
                }
            }
            // no deletions, so we're ok
            base += ar.maxDoc();
        }
        ordinalMap.addDone();
    }
}
Also used : LeafReader(org.apache.lucene.index.LeafReader) DirectoryReader(org.apache.lucene.index.DirectoryReader) FacetLabel(org.apache.lucene.facet.taxonomy.FacetLabel) Terms(org.apache.lucene.index.Terms) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) PostingsEnum(org.apache.lucene.index.PostingsEnum) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 25 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From the class SolrIndexSplitter, the method split:

/**
 * Computes, for one leaf segment, which documents belong to each of the
 * {@code numPieces} target sub-shards, keyed on the unique-key field.
 * Returns one {@link FixedBitSet} per piece (bit = docID in this segment).
 * When {@code ranges} is null, documents are distributed round-robin via
 * {@code currPartition}; otherwise each document goes to every range in
 * {@code rangesArr} that includes its route hash, and per-document
 * range-membership counts are logged (0 matches = data loss, >1 = duplication).
 */
FixedBitSet[] split(LeafReaderContext readerContext) throws IOException {
    LeafReader reader = readerContext.reader();
    FixedBitSet[] docSets = new FixedBitSet[numPieces];
    for (int i = 0; i < docSets.length; i++) {
        docSets[i] = new FixedBitSet(reader.maxDoc());
    }
    Bits liveDocs = reader.getLiveDocs();
    Fields fields = reader.fields();
    Terms terms = fields == null ? null : fields.terms(field.getName());
    TermsEnum termsEnum = terms == null ? null : terms.iterator();
    // No unique-key field in this segment: every piece gets an empty set.
    if (termsEnum == null)
        return docSets;
    BytesRef term = null;
    PostingsEnum postingsEnum = null;
    int[] docsMatchingRanges = null;
    if (ranges != null) {
        // +1 because documents can belong to *zero*, one, several or all ranges in rangesArr
        docsMatchingRanges = new int[rangesArr.length + 1];
    }
    CharsRefBuilder idRef = new CharsRefBuilder();
    // Iterate every unique-key term; each term identifies one (or more) docs.
    for (; ; ) {
        term = termsEnum.next();
        if (term == null)
            break;
        // figure out the hash for the term
        // FUTURE: if conversion to strings costs too much, we could
        // specialize and use the hash function that can work over bytes.
        field.getType().indexedToReadable(term, idRef);
        String idString = idRef.toString();
        if (splitKey != null) {
            // todo have composite routers support these kind of things instead
            // Only documents whose route key matches splitKey participate.
            String part1 = getRouteKey(idString);
            if (part1 == null)
                continue;
            if (!splitKey.equals(part1)) {
                continue;
            }
        }
        int hash = 0;
        if (hashRouter != null) {
            hash = hashRouter.sliceHash(idString, null, null, null);
        }
        // Reuse the enum across terms; filter out deleted documents.
        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        postingsEnum = BitsFilteredPostingsEnum.wrap(postingsEnum, liveDocs);
        for (; ; ) {
            int doc = postingsEnum.nextDoc();
            if (doc == DocIdSetIterator.NO_MORE_DOCS)
                break;
            if (ranges == null) {
                // Round-robin assignment when no hash ranges were supplied.
                docSets[currPartition].set(doc);
                currPartition = (currPartition + 1) % numPieces;
            } else {
                int matchingRangesCount = 0;
                for (int i = 0; i < rangesArr.length; i++) {
                    // inner-loop: use array here for extra speed.
                    if (rangesArr[i].includes(hash)) {
                        docSets[i].set(doc);
                        ++matchingRangesCount;
                    }
                }
                docsMatchingRanges[matchingRangesCount]++;
            }
        }
    }
    // Report anomalies: index 0 = dropped docs, 1 = normal, >1 = duplicated.
    if (docsMatchingRanges != null) {
        for (int ii = 0; ii < docsMatchingRanges.length; ii++) {
            if (0 == docsMatchingRanges[ii])
                continue;
            switch(ii) {
                case 0:
                    // document loss
                    log.error("Splitting {}: {} documents belong to no shards and will be dropped", reader, docsMatchingRanges[ii]);
                    break;
                case 1:
                    // normal case, each document moves to one of the sub-shards
                    log.info("Splitting {}: {} documents will move into a sub-shard", reader, docsMatchingRanges[ii]);
                    break;
                default:
                    // document duplication
                    log.error("Splitting {}: {} documents will be moved to multiple ({}) sub-shards", reader, docsMatchingRanges[ii], ii);
                    break;
            }
        }
    }
    return docSets;
}
Also used : LeafReader(org.apache.lucene.index.LeafReader) Terms(org.apache.lucene.index.Terms) TermsEnum(org.apache.lucene.index.TermsEnum) Fields(org.apache.lucene.index.Fields) FixedBitSet(org.apache.lucene.util.FixedBitSet) Bits(org.apache.lucene.util.Bits) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) BitsFilteredPostingsEnum(org.apache.solr.search.BitsFilteredPostingsEnum) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum)73 BytesRef (org.apache.lucene.util.BytesRef)55 TermsEnum (org.apache.lucene.index.TermsEnum)50 Terms (org.apache.lucene.index.Terms)42 Fields (org.apache.lucene.index.Fields)18 LeafReader (org.apache.lucene.index.LeafReader)17 Term (org.apache.lucene.index.Term)16 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)15 Document (org.apache.lucene.document.Document)13 ArrayList (java.util.ArrayList)10 IndexReader (org.apache.lucene.index.IndexReader)10 TextField (org.apache.lucene.document.TextField)9 Directory (org.apache.lucene.store.Directory)9 Bits (org.apache.lucene.util.Bits)9 IOException (java.io.IOException)8 DirectoryReader (org.apache.lucene.index.DirectoryReader)7 IndexWriter (org.apache.lucene.index.IndexWriter)6 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)6 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)5 XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder)5