Search in sources:

Example 81 with FixedBitSet

use of org.apache.lucene.util.FixedBitSet in project greplin-lucene-utils by Cue.

the class FixedBitSets method all.

/**
 * Builds a bitset sized to the reader's {@code maxDoc()} in which every bit is set.
 * @param reader the index
 * @return a bitset that includes all docs
 */
public static FixedBitSet all(final IndexReader reader) {
    final int docCount = reader.maxDoc();
    final FixedBitSet everyDoc = new FixedBitSet(docCount);
    // Range set over [0, docCount).
    everyDoc.set(0, docCount);
    return everyDoc;
}
Also used : FixedBitSet(org.apache.lucene.util.FixedBitSet)

Example 82 with FixedBitSet

use of org.apache.lucene.util.FixedBitSet in project greplin-lucene-utils by Cue.

the class PhraseFilter method getDocIdSet.

/**
 * Computes the documents matching this filter's terms, one sub-reader at a time,
 * and returns them as a single doc id set addressed relative to {@code reader}.
 *
 * For each sub-reader the terms are visited in the order given by the sorted set of
 * {@code TermWithFrequency}; the first term seeds a match list and every later term
 * is intersected into it. The combined result is returned as a bitset when there are
 * many matches, as {@code DocIdSets.EMPTY} when there are none, and as a sorted int
 * array otherwise.
 *
 * @param reader the reader to filter
 * @return the set of matching documents
 * @throws IOException if reading from the index fails
 */
@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    final List<IndexReader> leaves = IndexReaders.gatherSubReaders(reader);
    final PhraseFilterMatchList[] perLeafMatches = new PhraseFilterMatchList[leaves.size()];
    int totalMatches = 0;
    int leafSlot = 0;
    for (IndexReader leaf : leaves) {
        // Each term remembers its index within this.terms as its offset.
        final SortedSet<TermWithFrequency> orderedTerms = Sets.newTreeSet();
        int termIndex = 0;
        for (Term t : this.terms) {
            orderedTerms.add(new TermWithFrequency(t, leaf.docFreq(t), termIndex));
            termIndex++;
        }
        PhraseFilterMatchList current = null;
        final TermPositions positions = leaf.termPositions();
        try {
            for (TermWithFrequency candidate : orderedTerms) {
                // A term absent from this sub-reader ends the scan for it.
                if (candidate.docFreq == 0) {
                    break;
                }
                positions.seek(candidate.term);
                if (current != null) {
                    // Later terms: narrow down the matches collected so far.
                    current.intersect(positions, candidate.offset);
                } else {
                    // First term: collect every match that intersects with the
                    // provided initial document set.
                    // NOTE(review): the intersection is requested for the top-level
                    // reader while positions iterate the sub-reader - confirm that
                    // both use the same doc id space.
                    final Intersection intersection = this.intersectionProvider.get(reader);
                    current = new PhraseFilterMatchList(candidate.docFreq);
                    while (intersection.advanceToNextIntersection(positions)) {
                        final int occurrences = positions.freq();
                        final PhraseFilterIntList starts = new PhraseFilterIntList(occurrences);
                        for (int remaining = occurrences; remaining > 0; remaining--) {
                            // Store each position shifted back by the term's offset.
                            starts.add(positions.nextPosition() - candidate.offset);
                        }
                        current.add(positions.doc(), starts);
                    }
                }
                if (current.getCount() == 0) {
                    break;
                }
            }
        } finally {
            positions.close();
        }
        if (current != null) {
            perLeafMatches[leafSlot] = current;
            totalMatches += current.getCount();
        }
        leafSlot++;
    }

    // 2^5 = 32
    final int bitsPerIntPowerLogTwo = 5;
    if (totalMatches > (reader.maxDoc() >> bitsPerIntPowerLogTwo)) {
        // More matches than maxDoc / 32: return a bitset.
        final FixedBitSet bits = new FixedBitSet(reader.maxDoc());
        int docBase = 0;
        for (int leafIndex = 0; leafIndex < perLeafMatches.length; leafIndex++) {
            final PhraseFilterMatchList leafMatches = perLeafMatches[leafIndex];
            if (leafMatches != null) {
                final int n = leafMatches.getCount();
                final int[] ids = leafMatches.getDocIds();
                for (int k = 0; k < n; k++) {
                    bits.set(docBase + ids[k]);
                }
            }
            // Shift ids of the next sub-reader past this one's doc range.
            docBase += leaves.get(leafIndex).maxDoc();
        }
        return bits;
    }
    if (totalMatches == 0) {
        return DocIdSets.EMPTY;
    }

    // Otherwise: return the matching ids as one int array.
    final int[] merged = new int[totalMatches];
    int writeAt = 0;
    int docBase = 0;
    for (int leafIndex = 0; leafIndex < perLeafMatches.length; leafIndex++) {
        final PhraseFilterMatchList leafMatches = perLeafMatches[leafIndex];
        if (leafMatches != null) {
            final int n = leafMatches.getCount();
            final int[] ids = leafMatches.getDocIds();
            for (int k = 0; k < n; k++) {
                merged[writeAt++] = ids[k] + docBase;
            }
        }
        docBase += leaves.get(leafIndex).maxDoc();
    }
    return new SortedIntArrayDocIdSet(merged);
}
Also used : Intersection(com.greplin.lucene.util.Intersection) Term(org.apache.lucene.index.Term) FixedBitSet(org.apache.lucene.util.FixedBitSet) IndexReader(org.apache.lucene.index.IndexReader) TermPositions(org.apache.lucene.index.TermPositions)

Example 83 with FixedBitSet

use of org.apache.lucene.util.FixedBitSet in project crate by crate.

the class LinearizabilityChecker method isLinearizable.

/**
 * Searches for a linearization of {@code history} that is valid under {@code spec}.
 *
 * The search walks the linked entries built from the history. An entry that has a
 * match is tentatively applied through {@code spec.nextState}; an entry without a
 * match forces a backtrack to the most recent applied call. The search succeeds once
 * no entries remain after the head.
 *
 * @param spec the sequential specification providing the initial state and transitions
 * @param history the events to check
 * @param terminateEarly polled on every iteration; when it returns true the check stops
 * @return true if the entry list was emptied; false if the search ran out of
 *         calls to backtrack to or was terminated early
 */
private boolean isLinearizable(SequentialSpec spec, List<Event> history, BooleanSupplier terminateEarly) {
    LOGGER.debug("Checking history of size: {}: {}", history.size(), history);
    // the current state of the datatype
    Object currentState = spec.initialState();
    // the linearized prefix of the history
    final FixedBitSet linearizedOps = new FixedBitSet(history.size() / 2);
    // cache of explored <state, linearized prefix> pairs
    final Cache explored = new Cache();
    // path we're currently exploring
    final Deque<Tuple<Entry, Object>> path = new LinkedList<>();
    final Entry head = createLinkedEntries(history);
    // current entry
    Entry cursor = head.next;
    while (head.next != null) {
        if (terminateEarly.getAsBoolean()) {
            return false;
        }

        if (cursor.match == null) {
            // No match on this entry: undo the most recent applied call, if any.
            if (path.isEmpty()) {
                return false;
            }
            final Tuple<Entry, Object> previous = path.pop();
            cursor = previous.v1();
            currentState = previous.v2();
            linearizedOps.clear(cursor.id);
            cursor.unlift();
            cursor = cursor.next;
            continue;
        }

        final Optional<Object> candidate =
            spec.nextState(currentState, cursor.event.value, cursor.match.event.value);
        boolean explore = false;
        if (candidate.isPresent()) {
            // check if we have already explored this linearization
            final FixedBitSet withCursor = linearizedOps.clone();
            withCursor.set(cursor.id);
            explore = explored.add(candidate.get(), withCursor);
        }

        if (explore == false) {
            // Not applicable or already seen: try the next entry.
            cursor = cursor.next;
            continue;
        }

        // Apply the call, remember how to undo it, and restart from the first entry.
        path.push(new Tuple<>(cursor, currentState));
        currentState = candidate.get();
        linearizedOps.set(cursor.id);
        cursor.lift();
        cursor = head.next;
    }
    return true;
}
Also used : FixedBitSet(org.apache.lucene.util.FixedBitSet) Tuple(io.crate.common.collections.Tuple) LinkedList(java.util.LinkedList)

Example 84 with FixedBitSet

use of org.apache.lucene.util.FixedBitSet in project crate by crate.

the class InternalEngineTests method testConcurrentWritesAndCommits.

// Writes documents to the engine from several threads while the main thread keeps
// acquiring commit points, then verifies that every retained commit point carries
// consistent sequence number data.
// NOTE(review): commit refs still held in the list at the end are not closed within
// this method - confirm whether cleanup happens elsewhere.
@Test
public void testConcurrentWritesAndCommits() throws Exception {
    List<Engine.IndexCommitRef> acquiredCommits = new ArrayList<>();
    try (Store store = createStore();
        InternalEngine engine = createEngine(config(defaultSettings, store, createTempDir(), newMergePolicy(), null))) {
        final int writerCount = scaledRandomIntBetween(2, 4);
        final int docsPerWriter = randomIntBetween(500, 1000);
        // One extra party so this thread can release all writers together.
        final CyclicBarrier startBarrier = new CyclicBarrier(writerCount + 1);
        final List<Thread> writers = new ArrayList<>();
        final CountDownLatch writersDone = new CountDownLatch(writerCount);

        // create N indexing threads to index documents simultaneously
        for (int w = 0; w < writerCount; w++) {
            final int writerIdx = w;
            final Runnable indexTask = () -> {
                try {
                    // wait for all threads to start at the same time
                    startBarrier.await();
                    // index this writer's share of docs
                    for (int docNum = 0; docNum < docsPerWriter; docNum++) {
                        final String docId = "thread" + writerIdx + "#" + docNum;
                        ParsedDocument parsed = testParsedDocument(docId, null, testDocument(), B_1, null);
                        engine.index(indexForDoc(parsed));
                    }
                } catch (Exception e) {
                    throw new RuntimeException(e);
                } finally {
                    writersDone.countDown();
                }
            };
            writers.add(new Thread(indexTask));
        }

        // start the indexing threads
        writers.forEach(Thread::start);

        // wait for indexing threads to all be ready to start
        startBarrier.await();
        final int maxHeldCommits = randomIntBetween(10, 20);
        long waitMillis = 1;

        // create random commit points
        boolean allIndexed = false;
        while (allIndexed == false) {
            allIndexed = writersDone.await(waitMillis, TimeUnit.MILLISECONDS);
            acquiredCommits.add(engine.acquireLastIndexCommit(true));
            if (acquiredCommits.size() > maxHeldCommits) {
                // don't keep on piling up too many commits
                IOUtils.close(acquiredCommits.remove(randomIntBetween(0, acquiredCommits.size() - 1)));
                // back off exponentially so that slow writers eventually finish; this
                // reduces pressure on disks and avoids piling up too many commits
                waitMillis *= 2;
            }
        }

        // now, verify all the commits have the correct docs according to the user commit data
        long lastCheckpointSeen = SequenceNumbers.NO_OPS_PERFORMED;
        long lastMaxSeqNoSeen = SequenceNumbers.NO_OPS_PERFORMED;
        for (Engine.IndexCommitRef ref : acquiredCommits) {
            final IndexCommit commit = ref.getIndexCommit();
            final Map<String, String> userData = commit.getUserData();

            long localCheckpoint = SequenceNumbers.NO_OPS_PERFORMED;
            if (userData.containsKey(SequenceNumbers.LOCAL_CHECKPOINT_KEY)) {
                localCheckpoint = Long.parseLong(userData.get(SequenceNumbers.LOCAL_CHECKPOINT_KEY));
            }
            long maxSeqNo = UNASSIGNED_SEQ_NO;
            if (userData.containsKey(SequenceNumbers.MAX_SEQ_NO)) {
                maxSeqNo = Long.parseLong(userData.get(SequenceNumbers.MAX_SEQ_NO));
            }

            // local checkpoint and max seq no shouldn't go backwards
            assertThat(localCheckpoint, greaterThanOrEqualTo(lastCheckpointSeen));
            assertThat(maxSeqNo, greaterThanOrEqualTo(lastMaxSeqNoSeen));

            try (IndexReader commitReader = DirectoryReader.open(commit)) {
                final Long found = getHighestSeqNo(commitReader);
                final long highestSeqNo = found != null ? found.longValue() : SequenceNumbers.NO_OPS_PERFORMED;

                // make sure localCheckpoint <= highest seq no found <= maxSeqNo
                assertThat(highestSeqNo, greaterThanOrEqualTo(localCheckpoint));
                assertThat(highestSeqNo, lessThanOrEqualTo(maxSeqNo));

                // make sure all sequence numbers up to and including the local checkpoint are in the index
                final FixedBitSet presentSeqNos = getSeqNosSet(commitReader, highestSeqNo);
                for (int i = 0; i <= localCheckpoint; i++) {
                    assertTrue("local checkpoint [" + localCheckpoint + "], _seq_no [" + i + "] should be indexed", presentSeqNos.get(i));
                }
            }

            lastCheckpointSeen = localCheckpoint;
            lastMaxSeqNoSeen = maxSeqNo;
        }
    }
}
Also used : ArrayList(java.util.ArrayList) Store(org.elasticsearch.index.store.Store) Matchers.containsString(org.hamcrest.Matchers.containsString) CountDownLatch(java.util.concurrent.CountDownLatch) LongPoint(org.apache.lucene.document.LongPoint) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) BrokenBarrierException(java.util.concurrent.BrokenBarrierException) ElasticsearchException(org.elasticsearch.ElasticsearchException) IndexCommit(org.apache.lucene.index.IndexCommit) CyclicBarrier(java.util.concurrent.CyclicBarrier) ParsedDocument(org.elasticsearch.index.mapper.ParsedDocument) FixedBitSet(org.apache.lucene.util.FixedBitSet) IndexReader(org.apache.lucene.index.IndexReader) AtomicLong(java.util.concurrent.atomic.AtomicLong) Test(org.junit.Test)

Example 85 with FixedBitSet

use of org.apache.lucene.util.FixedBitSet in project OpenGrok by OpenGrok.

the class CustomSloppyPhraseScorer method gatherRptGroups.

/**
 * Detect repetition groups among the repeating phrase positions and tag each member
 * with its group index via {@code rptGroup}. Done once - for first doc.
 *
 * @param rptTerms the repeating terms
 * @return one list of phrase positions per repetition group, indexed by group id
 * @throws IOException declared on the signature; this body has no explicit throw,
 *         so any such failure would come from the helpers it calls
 */
private ArrayList<ArrayList<PhrasePositions>> gatherRptGroups(LinkedHashMap<Term, Integer> rptTerms) throws IOException {
    final PhrasePositions[] repeating = repeatingPPs(rptTerms);
    final ArrayList<ArrayList<PhrasePositions>> groups = new ArrayList<>();

    if (hasMultiTermRpts) {
        // more involved - has multi-terms: group by the term groups derived from bit sets
        final ArrayList<HashSet<PhrasePositions>> buckets = new ArrayList<>();
        final ArrayList<FixedBitSet> termBits = ppTermsBitSets(repeating, rptTerms);
        unionTermGroups(termBits);
        final HashMap<Term, Integer> groupOfTerm = termGroups(rptTerms, termBits);
        final HashSet<Integer> distinctGroupIDs = new HashSet<>(groupOfTerm.values());
        // One bucket per distinct group id.
        for (int remaining = distinctGroupIDs.size(); remaining > 0; remaining--) {
            buckets.add(new HashSet<>());
        }
        for (PhrasePositions pp : repeating) {
            for (Term t : pp.terms) {
                if (!rptTerms.containsKey(t)) {
                    continue;
                }
                final int g = groupOfTerm.get(t);
                buckets.get(g).add(pp);
                // A phrase position must not end up in two different groups.
                assert pp.rptGroup == -1 || pp.rptGroup == g;
                pp.rptGroup = g;
            }
        }
        for (HashSet<PhrasePositions> bucket : buckets) {
            groups.add(new ArrayList<>(bucket));
        }
        return groups;
    }

    // simpler - no multi-terms - can base on positions in first doc
    for (int i = 0; i < repeating.length; i++) {
        final PhrasePositions first = repeating[i];
        if (first.rptGroup >= 0) {
            // already marked as a repetition
            continue;
        }
        final int firstPos = tpPos(first);
        for (int j = i + 1; j < repeating.length; j++) {
            final PhrasePositions other = repeating[j];
            // A repetition is a not-yet-grouped PP at a different query offset
            // (two PPs originally in the same offset in the query do not count)
            // whose tpPos equals that of the first PP.
            final boolean repeats = other.rptGroup < 0
                && other.offset != first.offset
                && tpPos(other) == firstPos;
            if (!repeats) {
                continue;
            }
            int g = first.rptGroup;
            if (g < 0) {
                // Open a new group, seeded with the first PP.
                g = groups.size();
                first.rptGroup = g;
                final ArrayList<PhrasePositions> members = new ArrayList<>(2);
                members.add(first);
                groups.add(members);
            }
            other.rptGroup = g;
            groups.get(g).add(other);
        }
    }
    return groups;
}
Also used : ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) FixedBitSet(org.apache.lucene.util.FixedBitSet) HashSet(java.util.HashSet)

Aggregations

FixedBitSet (org.apache.lucene.util.FixedBitSet)162 Term (org.apache.lucene.index.Term)27 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)26 Directory (org.apache.lucene.store.Directory)25 BytesRef (org.apache.lucene.util.BytesRef)22 IOException (java.io.IOException)19 Document (org.apache.lucene.document.Document)17 ArrayList (java.util.ArrayList)15 Query (org.apache.lucene.search.Query)15 NumericDocValues (org.apache.lucene.index.NumericDocValues)14 BitDocIdSet (org.apache.lucene.util.BitDocIdSet)13 Bits (org.apache.lucene.util.Bits)13 LeafReader (org.apache.lucene.index.LeafReader)12 IndexSearcher (org.apache.lucene.search.IndexSearcher)12 TermQuery (org.apache.lucene.search.TermQuery)12 IndexReader (org.apache.lucene.index.IndexReader)11 HashSet (java.util.HashSet)10 NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField)10 DocIterator (org.apache.solr.search.DocIterator)10 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)9