Use of org.apache.lucene.util.FixedBitSet in project greplin-lucene-utils by Cue.
The class FixedBitSets, method all().
/**
 * Creates a bitset that includes all docs from the given reader.
 * @param reader the index
 * @return a bitset that includes all docs
 */
public static FixedBitSet all(final IndexReader reader) {
  FixedBitSet result = new FixedBitSet(reader.maxDoc());
  result.set(0, reader.maxDoc());
  return result;
}
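One natural use of all() is as a seed set that is then narrowed, for example down to the live (non-deleted) documents. A minimal sketch, assuming the Lucene 3.x reader API that the PhraseFilter excerpt below also relies on (the usage itself is illustrative, not taken from the project):

// Start from every doc slot, then clear the deleted ones.
FixedBitSet candidates = FixedBitSets.all(reader);
if (reader.hasDeletions()) {
  for (int doc = 0; doc < reader.maxDoc(); doc++) {
    if (reader.isDeleted(doc)) {
      candidates.clear(doc);
    }
  }
}
// candidates.cardinality() now equals reader.numDocs().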
Use of org.apache.lucene.util.FixedBitSet in project greplin-lucene-utils by Cue.
The class PhraseFilter, method getDocIdSet().
@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
  List<IndexReader> subReaders = IndexReaders.gatherSubReaders(reader);
  PhraseFilterMatchList[] results = new PhraseFilterMatchList[subReaders.size()];
  int matchCount = 0;
  int readerNumber = 0;
  for (IndexReader subReader : subReaders) {
    SortedSet<TermWithFrequency> termsOrderedByFrequency = Sets.newTreeSet();
    for (int i = 0; i < this.terms.length; i++) {
      Term t = this.terms[i];
      termsOrderedByFrequency.add(new TermWithFrequency(t, subReader.docFreq(t), i));
    }
    PhraseFilterMatchList matches = null;
    TermPositions termPositions = subReader.termPositions();
    try {
      for (TermWithFrequency term : termsOrderedByFrequency) {
        if (term.docFreq == 0) {
          break;
        }
        termPositions.seek(term.term);
        if (matches == null) {
          // If this is the first term, collect all matches that intersect
          // with the provided initial document set.
          Intersection intersection = this.intersectionProvider.get(reader);
          matches = new PhraseFilterMatchList(term.docFreq);
          while (intersection.advanceToNextIntersection(termPositions)) {
            int freq = termPositions.freq();
            PhraseFilterIntList list = new PhraseFilterIntList(freq);
            for (int i = 0; i < freq; i++) {
              list.add(termPositions.nextPosition() - term.offset);
            }
            matches.add(termPositions.doc(), list);
          }
        } else {
          // Otherwise, intersect with the existing matches.
          matches.intersect(termPositions, term.offset);
        }
        if (matches.getCount() == 0) {
          break;
        }
      }
    } finally {
      termPositions.close();
    }
    if (matches != null) {
      results[readerNumber] = matches;
      matchCount += matches.getCount();
    }
    readerNumber++;
  }
  // 2^5 = 32
  final int bitsPerIntPowerLogTwo = 5;
  if (matchCount > reader.maxDoc() >> bitsPerIntPowerLogTwo) {
    FixedBitSet result = new FixedBitSet(reader.maxDoc());
    int readerOffset = 0;
    for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
      PhraseFilterMatchList matches = results[readerIndex];
      if (matches != null) {
        int count = matches.getCount();
        int[] docIds = matches.getDocIds();
        for (int i = 0; i < count; i++) {
          result.set(docIds[i] + readerOffset);
        }
      }
      readerOffset += subReaders.get(readerIndex).maxDoc();
    }
    return result;
  } else if (matchCount == 0) {
    return DocIdSets.EMPTY;
  } else {
    int[] result = new int[matchCount];
    int base = 0;
    int readerOffset = 0;
    for (int readerIndex = 0; readerIndex < results.length; readerIndex++) {
      PhraseFilterMatchList matches = results[readerIndex];
      if (matches != null) {
        int count = matches.getCount();
        int[] docIds = matches.getDocIds();
        for (int i = 0; i < count; i++) {
          result[base + i] = docIds[i] + readerOffset;
        }
        base += count;
      }
      readerOffset += subReaders.get(readerIndex).maxDoc();
    }
    return new SortedIntArrayDocIdSet(result);
  }
}
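The final branch picks a result representation by density: a FixedBitSet always costs about one bit per document in the index, while the sorted int array costs 32 bits per matching document, so the array is smaller exactly when fewer than one in 32 documents match. The arithmetic behind the maxDoc() >> 5 threshold (illustrative only, not project code):

// FixedBitSet: ~1 bit per doc slot, regardless of how many docs matched.
long bitSetBits = reader.maxDoc();
// Sorted int array: 32 bits per match.
long intArrayBits = 32L * matchCount;
// The array wins while 32 * matchCount <= maxDoc,
// i.e. while matchCount <= reader.maxDoc() >> bitsPerIntPowerLogTwo.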
Use of org.apache.lucene.util.FixedBitSet in project crate by crate.
The class LinearizabilityChecker, method isLinearizable().
private boolean isLinearizable(SequentialSpec spec, List<Event> history, BooleanSupplier terminateEarly) {
  LOGGER.debug("Checking history of size: {}: {}", history.size(), history);
  // the current state of the datatype
  Object state = spec.initialState();
  // the linearized prefix of the history
  final FixedBitSet linearized = new FixedBitSet(history.size() / 2);
  // cache of explored <state, linearized prefix> pairs
  final Cache cache = new Cache();
  // path we're currently exploring
  final Deque<Tuple<Entry, Object>> calls = new LinkedList<>();
  final Entry headEntry = createLinkedEntries(history);
  // current entry
  Entry entry = headEntry.next;
  while (headEntry.next != null) {
    if (terminateEarly.getAsBoolean()) {
      return false;
    }
    if (entry.match != null) {
      final Optional<Object> maybeNextState = spec.nextState(state, entry.event.value, entry.match.event.value);
      boolean shouldExploreNextState = false;
      if (maybeNextState.isPresent()) {
        // check if we have already explored this linearization
        final FixedBitSet updatedLinearized = linearized.clone();
        updatedLinearized.set(entry.id);
        shouldExploreNextState = cache.add(maybeNextState.get(), updatedLinearized);
      }
      if (shouldExploreNextState) {
        calls.push(new Tuple<>(entry, state));
        state = maybeNextState.get();
        linearized.set(entry.id);
        entry.lift();
        entry = headEntry.next;
      } else {
        entry = entry.next;
      }
    } else {
      if (calls.isEmpty()) {
        return false;
      }
      final Tuple<Entry, Object> top = calls.pop();
      entry = top.v1();
      state = top.v2();
      linearized.clear(entry.id);
      entry.unlift();
      entry = entry.next;
    }
  }
  return true;
}
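The Cache referenced above is not part of this excerpt. Conceptually it deduplicates <state, linearized prefix> pairs, which works because FixedBitSet implements equals() and hashCode(). A minimal sketch of such a cache (the actual class in crate may be more elaborate):

// Minimal sketch: remember every <state, linearized prefix> pair seen so far.
// Assumes the spec's state objects implement equals()/hashCode(), as
// FixedBitSet already does.
static final class Cache {
  private final Map<Object, Set<FixedBitSet>> explored = new HashMap<>();

  // Returns true only the first time this <state, prefix> pair is added.
  boolean add(Object state, FixedBitSet linearizedPrefix) {
    return explored.computeIfAbsent(state, k -> new HashSet<>()).add(linearizedPrefix);
  }
}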
Use of org.apache.lucene.util.FixedBitSet in project crate by crate.
The class InternalEngineTests, method testConcurrentWritesAndCommits().
// this test writes documents to the engine while concurrently flushing/committing,
// and ensures that the commit points contain the correct sequence number data
@Test
public void testConcurrentWritesAndCommits() throws Exception {
  List<Engine.IndexCommitRef> commits = new ArrayList<>();
  try (Store store = createStore();
       InternalEngine engine = createEngine(config(defaultSettings, store, createTempDir(), newMergePolicy(), null))) {
    final int numIndexingThreads = scaledRandomIntBetween(2, 4);
    final int numDocsPerThread = randomIntBetween(500, 1000);
    final CyclicBarrier barrier = new CyclicBarrier(numIndexingThreads + 1);
    final List<Thread> indexingThreads = new ArrayList<>();
    final CountDownLatch doneLatch = new CountDownLatch(numIndexingThreads);
    // create N indexing threads to index documents simultaneously
    for (int threadNum = 0; threadNum < numIndexingThreads; threadNum++) {
      final int threadIdx = threadNum;
      Thread indexingThread = new Thread(() -> {
        try {
          // wait for all threads to start at the same time
          barrier.await();
          // index a random number of docs
          for (int i = 0; i < numDocsPerThread; i++) {
            final String id = "thread" + threadIdx + "#" + i;
            ParsedDocument doc = testParsedDocument(id, null, testDocument(), B_1, null);
            engine.index(indexForDoc(doc));
          }
        } catch (Exception e) {
          throw new RuntimeException(e);
        } finally {
          doneLatch.countDown();
        }
      });
      indexingThreads.add(indexingThread);
    }
    // start the indexing threads
    for (Thread thread : indexingThreads) {
      thread.start();
    }
    // wait for indexing threads to all be ready to start
    barrier.await();
    int commitLimit = randomIntBetween(10, 20);
    long sleepTime = 1;
    // create random commit points
    boolean doneIndexing;
    do {
      doneIndexing = doneLatch.await(sleepTime, TimeUnit.MILLISECONDS);
      commits.add(engine.acquireLastIndexCommit(true));
      if (commits.size() > commitLimit) {
        // don't keep on piling up too many commits
        IOUtils.close(commits.remove(randomIntBetween(0, commits.size() - 1)));
        // increase the wait time so that, if indexing is slow, we eventually wait for the
        // threads to finish; this reduces pressure on disks and lets the threads make
        // progress without piling up too many commits
        sleepTime = sleepTime * 2;
      }
    } while (doneIndexing == false);
    // now, verify all the commits have the correct docs according to the user commit data
    long prevLocalCheckpoint = SequenceNumbers.NO_OPS_PERFORMED;
    long prevMaxSeqNo = SequenceNumbers.NO_OPS_PERFORMED;
    for (Engine.IndexCommitRef commitRef : commits) {
      final IndexCommit commit = commitRef.getIndexCommit();
      Map<String, String> userData = commit.getUserData();
      long localCheckpoint = userData.containsKey(SequenceNumbers.LOCAL_CHECKPOINT_KEY)
          ? Long.parseLong(userData.get(SequenceNumbers.LOCAL_CHECKPOINT_KEY))
          : SequenceNumbers.NO_OPS_PERFORMED;
      long maxSeqNo = userData.containsKey(SequenceNumbers.MAX_SEQ_NO)
          ? Long.parseLong(userData.get(SequenceNumbers.MAX_SEQ_NO))
          : UNASSIGNED_SEQ_NO;
      // local checkpoint and max seq no shouldn't go backwards
      assertThat(localCheckpoint, greaterThanOrEqualTo(prevLocalCheckpoint));
      assertThat(maxSeqNo, greaterThanOrEqualTo(prevMaxSeqNo));
      try (IndexReader reader = DirectoryReader.open(commit)) {
        Long highest = getHighestSeqNo(reader);
        final long highestSeqNo;
        if (highest != null) {
          highestSeqNo = highest.longValue();
        } else {
          highestSeqNo = SequenceNumbers.NO_OPS_PERFORMED;
        }
        // make sure localCheckpoint <= highest seq no found <= maxSeqNo
        assertThat(highestSeqNo, greaterThanOrEqualTo(localCheckpoint));
        assertThat(highestSeqNo, lessThanOrEqualTo(maxSeqNo));
        // make sure all sequence numbers up to and including the local checkpoint are in the index
        FixedBitSet seqNosBitSet = getSeqNosSet(reader, highestSeqNo);
        for (int i = 0; i <= localCheckpoint; i++) {
          assertTrue("local checkpoint [" + localCheckpoint + "], _seq_no [" + i + "] should be indexed",
              seqNosBitSet.get(i));
        }
      }
      prevLocalCheckpoint = localCheckpoint;
      prevMaxSeqNo = maxSeqNo;
    }
  }
}
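The helper getSeqNosSet is referenced above but not included in this excerpt. A plausible sketch of it, assuming _seq_no is stored as a numeric doc value (standard Lucene LeafReader/NumericDocValues APIs; the SeqNoFieldMapper.NAME field constant and the live-docs handling are assumptions):

// Sketch of the helper: mark every live _seq_no value found in the commit
// in a FixedBitSet sized to hold seq numbers 0..highestSeqNo.
private static FixedBitSet getSeqNosSet(IndexReader reader, long highestSeqNo) throws IOException {
  final FixedBitSet bitSet = new FixedBitSet(Math.toIntExact(highestSeqNo + 1));
  for (LeafReaderContext leafContext : reader.leaves()) {
    final LeafReader leaf = leafContext.reader();
    final NumericDocValues values = leaf.getNumericDocValues(SeqNoFieldMapper.NAME);
    if (values == null) {
      continue;
    }
    final Bits liveDocs = leaf.getLiveDocs();
    for (int docId = 0; docId < leaf.maxDoc(); docId++) {
      if ((liveDocs == null || liveDocs.get(docId)) && values.advanceExact(docId)) {
        bitSet.set(Math.toIntExact(values.longValue()));
      }
    }
  }
  return bitSet;
}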
Use of org.apache.lucene.util.FixedBitSet in project OpenGrok by OpenGrok.
The class CustomSloppyPhraseScorer, method gatherRptGroups().
/**
 * Detect repetition groups. Done once - for first doc.
 */
private ArrayList<ArrayList<PhrasePositions>> gatherRptGroups(LinkedHashMap<Term, Integer> rptTerms) throws IOException {
  PhrasePositions[] rpp = repeatingPPs(rptTerms);
  ArrayList<ArrayList<PhrasePositions>> res = new ArrayList<>();
  if (!hasMultiTermRpts) {
    // simpler - no multi-terms - can base on positions in first doc
    for (int i = 0; i < rpp.length; i++) {
      PhrasePositions pp = rpp[i];
      if (pp.rptGroup >= 0) {
        // already marked as a repetition
        continue;
      }
      int tpPos = tpPos(pp);
      for (int j = i + 1; j < rpp.length; j++) {
        PhrasePositions pp2 = rpp[j];
        if (pp2.rptGroup >= 0           // already marked as a repetition
            || pp2.offset == pp.offset  // not a repetition: two PPs are originally in same offset in the query!
            || tpPos(pp2) != tpPos) {   // not a repetition
          continue;
        }
        // a repetition
        int g = pp.rptGroup;
        if (g < 0) {
          g = res.size();
          pp.rptGroup = g;
          ArrayList<PhrasePositions> rl = new ArrayList<>(2);
          rl.add(pp);
          res.add(rl);
        }
        pp2.rptGroup = g;
        res.get(g).add(pp2);
      }
    }
  } else {
    // more involved - has multi-terms
    ArrayList<HashSet<PhrasePositions>> tmp = new ArrayList<>();
    ArrayList<FixedBitSet> bb = ppTermsBitSets(rpp, rptTerms);
    unionTermGroups(bb);
    HashMap<Term, Integer> tg = termGroups(rptTerms, bb);
    HashSet<Integer> distinctGroupIDs = new HashSet<>(tg.values());
    for (int i = 0; i < distinctGroupIDs.size(); i++) {
      tmp.add(new HashSet<>());
    }
    for (PhrasePositions pp : rpp) {
      for (Term t : pp.terms) {
        if (rptTerms.containsKey(t)) {
          int g = tg.get(t);
          tmp.get(g).add(pp);
          assert pp.rptGroup == -1 || pp.rptGroup == g;
          pp.rptGroup = g;
        }
      }
    }
    for (HashSet<PhrasePositions> hs : tmp) {
      res.add(new ArrayList<>(hs));
    }
  }
  return res;
}
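In the multi-term branch, each FixedBitSet in bb marks which repeating terms a PhrasePositions instance carries, and unionTermGroups (not shown here) merges intersecting bitsets so that connected terms collapse into one repetition group. A sketch of that merging step, modeled on Lucene's stock SloppyPhraseScorer (this fork's version may differ):

// Merge intersecting term bitsets until the remaining sets are pairwise
// disjoint; each surviving bitset then identifies one repetition group.
private void unionTermGroups(ArrayList<FixedBitSet> bb) {
  int incr;
  for (int i = 0; i < bb.size() - 1; i += incr) {
    incr = 1;
    int j = i + 1;
    while (j < bb.size()) {
      if (bb.get(i).intersects(bb.get(j))) {
        bb.get(i).or(bb.get(j));  // union the two groups in place
        bb.remove(j);             // drop the merged-in set
        incr = 0;                 // stay on i: the grown union may intersect sets already skipped
      } else {
        ++j;
      }
    }
  }
}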