use of datawave.query.index.lookup.IndexInfo in project datawave by NationalSecurityAgency.
the class RangeStreamScanner method scannerInvariant.
/**
 * Reads key/value pairs from {@code iter}, groups them by day (as computed by {@code getDay}) in {@code currentQueue}, and hands the grouped results on
 * via {@code dequeue()} or, when a day is collapsed, directly to {@code resultQueue}.
 * <p>
 * Visible flow:
 * <ol>
 * <li>If an entry was deferred on an earlier call ({@code prevDay}), it is offered to {@code resultQueue} for up to one second. When that fails, or the
 * wait is interrupted, the method returns 0 without consuming anything from {@code iter}.</li>
 * <li>While holding {@code writeLock}, entries are peeked one at a time. The first entry of a day establishes {@code currentDay}; following entries of
 * the same day are added while the queue size is at most {@code shardsPerDayThreshold} or the median of the recorded counts is below {@code MAX_MEDIAN}.
 * </li>
 * <li>When an entry for a different day is seen, the queue is flushed through {@code dequeue()} / {@code dequeue(true)} and reading either stops or
 * continues with the new day, depending on the counts returned.</li>
 * <li>After the loop, if the queue holds at least {@code shardsPerDayThreshold} entries and the median count exceeds {@code MAX_MEDIAN}, the queue is
 * replaced by one entry whose column qualifier is {@code currentDay}; otherwise the queue is flushed with {@code dequeue()}.</li>
 * </ol>
 *
 * @param iter
 *            source of index entries; wrapped in a peeking iterator so an entry can be inspected before it is consumed
 * @return the sum of the values returned by the {@code dequeue()} calls made during this invocation, or 0 when a deferred entry could not be delivered
 */
protected int scannerInvariant(final Iterator<Entry<Key, Value>> iter) {
    PeekingIterator<Entry<Key, Value>> kvIter = new PeekingIterator<>(iter);
    int retrievalCount = 0;
    String currentDay = null;

    // An entry deferred by an earlier call has to be delivered before anything new is read.
    if (null != prevDay) {
        try {
            if (log.isTraceEnabled()) {
                log.trace("Attempting to insert " + prevDay);
            }
            if (!resultQueue.offer(prevDay, 1, TimeUnit.SECONDS)) {
                return 0;
            }
            prevDay = null;
        } catch (InterruptedException e) {
            // prevDay is left in place so a later call can retry.
            // NOTE(review): the interrupt status is not restored here; confirm the caller does not rely on it.
            return 0;
        }
    }

    // produces stats for us, so we don't have to!
    DescriptiveStatistics stats = new DescriptiveStatistics();
    writeLock.lock();
    try {
        while (kvIter.hasNext()) {
            Entry<Key, Value> currentKeyValue = kvIter.peek();

            // become a pass-through if we've seen an unexpected key.
            // The entry is queued but not consumed from kvIter, and lastSeenKey is not advanced.
            if (seenUnexpectedKey) {
                currentQueue.add(trimTrailingUnderscore(currentKeyValue));
                break;
            }

            if (null == currentDay) {
                // Resolve the day before logging so the trace message reports the real day rather than null.
                currentDay = getDay(currentKeyValue.getKey());
                if (log.isTraceEnabled()) {
                    log.trace("it's a new day!");
                    log.trace("adding " + currentKeyValue.getKey() + " to queue because it matches" + currentDay);
                }
                // NOTE(review): the first entry of a day is queued without adding its count to stats.
                currentQueue.add(trimTrailingUnderscore(currentKeyValue));
                lastSeenKey = kvIter.next().getKey();
            } else {
                String nextKeysDay = getDay(currentKeyValue.getKey());
                if (currentDay.equals(nextKeysDay)) {
                    if (log.isTraceEnabled()) {
                        log.trace("adding " + currentKeyValue.getKey() + " to queue because it matches" + currentDay);
                    }
                    IndexInfo info = readInfoFromValue(currentKeyValue.getValue());
                    if (log.isTraceEnabled()) {
                        log.trace("adding count of " + info.count());
                    }
                    stats.addValue(info.count());

                    // Keep queueing while the day is still small or the median count is low.
                    if (currentQueue.size() <= shardsPerDayThreshold || stats.getPercentile(50) < MAX_MEDIAN) {
                        if (log.isTraceEnabled()) {
                            log.trace("adding our stats are " + stats.getPercentile(50) + " on " + currentQueue.size());
                        }
                        currentQueue.add(trimTrailingUnderscore(currentKeyValue));
                    } else {
                        if (log.isTraceEnabled()) {
                            log.trace("breaking because our stats are " + stats.getPercentile(50) + " on " + currentQueue.size());
                        }
                        // The peeked entry is left unconsumed.
                        break;
                    }
                    lastSeenKey = kvIter.next().getKey();
                } else {
                    // Day boundary: flush what was gathered for the previous day. The peeked entry stays in kvIter.
                    int dequeueCount = dequeue();
                    retrievalCount += dequeueCount;
                    int queueSize = currentQueue.size();
                    dequeue(true);
                    currentDay = null;
                    // NOTE(review): stats is not reset here, so counts from the previous day carry over if the loop continues.
                    if (dequeueCount != queueSize || retrievalCount <= Math.ceil(maxResults * 1.5)) {
                        break;
                    }
                }
            }
        }

        if (currentQueue.size() >= shardsPerDayThreshold && stats.getPercentile(50) > MAX_MEDIAN) {
            // Collapse the queued entries into a single entry keyed by the day.
            // NOTE(review): assumes the queue is non-empty and currentDay is non-null on this path; confirm.
            Entry<Key, Value> top = currentQueue.poll();
            Key topKey = top.getKey();
            if (log.isTraceEnabled()) {
                log.trace(topKey + " for " + currentDay + " exceeds limit of " + shardsPerDayThreshold + " with " + currentQueue.size());
            }
            Key newKey = new Key(topKey.getRow(), topKey.getColumnFamily(), new Text(currentDay), topKey.getColumnVisibility(), topKey.getTimestamp());
            Value newValue = writeInfoToValue();
            final Entry<Key, Value> myEntry = Maps.immutableEntry(newKey, newValue);
            lastSeenKey = newKey;
            try {
                if (!resultQueue.offer(myEntry, 1, TimeUnit.SECONDS)) {
                    if (log.isTraceEnabled()) {
                        log.trace("could not add day! converting " + myEntry + " to " + prevDay);
                    }
                    // Remember the entry so the next call can retry the hand-off.
                    prevDay = myEntry;
                }
            } catch (InterruptedException exception) {
                // Same deferral as a timed-out offer. NOTE(review): interrupt status is not restored.
                prevDay = myEntry;
            }
            currentQueue.clear();
        } else {
            retrievalCount += dequeue();
        }
    } finally {
        writeLock.unlock();
    }
    return retrievalCount;
}
use of datawave.query.index.lookup.IndexInfo in project datawave by NationalSecurityAgency.
the class RangeStreamScannerTest method testTheSimplestOfScans.
/**
 * Simplest possible scan: the query FOO == 'bar' must produce exactly one entry, keyed "20190314", whose count is 2.
 */
@Test
public void testTheSimplestOfScans() throws Exception {
    // Query under test: "FOO == 'bar'"
    final String field = "FOO";
    final String term = "bar";
    final ASTEQNode queryNode = (ASTEQNode) JexlNodeFactory.buildEQNode(field, term);

    // Wire the scanner and the entry parser together into a ScannerStream.
    final RangeStreamScanner scanner = buildRangeStreamScanner(field, term);
    final EntryParser parser = new EntryParser(queryNode, field, term, config.getIndexedFields());
    final ScannerStream stream = ScannerStream.initialized(scanner, parser, queryNode);

    // There has to be at least one result before we start counting.
    assertTrue(stream.hasNext());

    int shardsSeen = 0;
    int documentsSeen = 0;
    while (stream.hasNext()) {
        final Tuple2<String, IndexInfo> result = stream.next();
        final String shard = result.first();
        final IndexInfo info = result.second();

        assertEquals("Expected shard to start with '20190314' but was: " + shard, "20190314", shard);
        assertEquals(2, info.count());

        shardsSeen++;
        documentsSeen += info.count();
    }

    assertEquals(1, shardsSeen);
    assertEquals(2, documentsSeen);
    assertFalse(stream.hasNext());
}
use of datawave.query.index.lookup.IndexInfo in project datawave by NationalSecurityAgency.
the class RangeStreamScannerTest method testExceedShardsPerDayThresholdAndDocumentsPerShardThreshold.
/**
 * The query FOO == 'boohoo' must come back as a single entry for day 20190323 with a count of -1, i.e. the shard ranges were collapsed into a day range.
 * <p>
 * NOTE(review): the previous description named day 20190319 with 15 shards of 25 document ids each; the assertions below check day 20190323, and the
 * fixture sizes cannot be confirmed from this method.
 */
@Test
public void testExceedShardsPerDayThresholdAndDocumentsPerShardThreshold() throws Exception {
    // Query under test: "FOO == 'boohoo'"
    final String field = "FOO";
    final String term = "boohoo";
    final ASTEQNode queryNode = (ASTEQNode) JexlNodeFactory.buildEQNode(field, term);

    // Wire the scanner and the entry parser together into a ScannerStream.
    final RangeStreamScanner scanner = buildRangeStreamScanner(field, term);
    final EntryParser parser = new EntryParser(queryNode, field, term, config.getIndexedFields());
    final ScannerStream stream = ScannerStream.initialized(scanner, parser, queryNode);

    // There has to be at least one result before we start counting.
    assertTrue(stream.hasNext());

    int shardsSeen = 0;
    int documentsSeen = 0;
    while (stream.hasNext()) {
        final Tuple2<String, IndexInfo> result = stream.next();
        final String shard = result.first();

        assertTrue("Expected shard to start with '20190323' but was: " + shard, shard.startsWith("20190323"));

        shardsSeen++;
        documentsSeen += result.second().count();
    }

    // A single range with a count of -1 means the shard ranges were collapsed into a day range.
    assertEquals(1, shardsSeen);
    assertEquals(-1, documentsSeen);
    assertFalse(stream.hasNext());
}
use of datawave.query.index.lookup.IndexInfo in project datawave by NationalSecurityAgency.
the class RangeStreamScannerTest method testExceedMaxMedianDocumentsPerShard.
/**
 * The query FOO == 'boo' must return 8 individual shard entries for day 20190319, each with a count of 15 (120 in total).
 */
@Test
public void testExceedMaxMedianDocumentsPerShard() throws Exception {
    // Query under test: "FOO == 'boo'"
    final String field = "FOO";
    final String term = "boo";
    final ASTEQNode queryNode = (ASTEQNode) JexlNodeFactory.buildEQNode(field, term);

    // Here the parser is applied through a transforming iterator, which is then handed to the ScannerStream.
    final RangeStreamScanner scanner = buildRangeStreamScanner(field, term);
    final EntryParser parser = new EntryParser(queryNode, field, term, config.getIndexedFields());
    final Iterator<Tuple2<String, IndexInfo>> parsed = Iterators.transform(scanner, parser);
    final ScannerStream stream = ScannerStream.initialized(parsed, queryNode);

    // There has to be at least one result before we start counting.
    assertTrue(stream.hasNext());

    int shardsSeen = 0;
    int documentsSeen = 0;
    while (stream.hasNext()) {
        final Tuple2<String, IndexInfo> result = stream.next();
        final String shard = result.first();
        final IndexInfo info = result.second();

        assertTrue("Expected shard to start with '20190319_' but was: " + shard, shard.startsWith("20190319_"));
        assertEquals(15, info.count());

        shardsSeen++;
        documentsSeen += info.count();
    }

    assertEquals(8, shardsSeen);
    assertEquals(120, documentsSeen);
    assertFalse(stream.hasNext());
}
use of datawave.query.index.lookup.IndexInfo in project datawave by NationalSecurityAgency.
the class RangeStreamScannerTest method testExceedShardDayThreshold.
/**
 * The query FOO == 'baz' must return 15 individual shard entries for day 20190317, each with a count of 2 (30 in total).
 */
@Test
public void testExceedShardDayThreshold() throws Exception {
    // Query under test: "FOO == 'baz'"
    final String field = "FOO";
    final String term = "baz";
    final ASTEQNode queryNode = (ASTEQNode) JexlNodeFactory.buildEQNode(field, term);

    // Here the parser is applied through a transforming iterator, which is then handed to the ScannerStream.
    final RangeStreamScanner scanner = buildRangeStreamScanner(field, term);
    final EntryParser parser = new EntryParser(queryNode, field, term, config.getIndexedFields());
    final Iterator<Tuple2<String, IndexInfo>> parsed = Iterators.transform(scanner, parser);
    final ScannerStream stream = ScannerStream.initialized(parsed, queryNode);

    // There has to be at least one result before we start counting.
    assertTrue(stream.hasNext());

    int shardsSeen = 0;
    int documentsSeen = 0;
    while (stream.hasNext()) {
        final Tuple2<String, IndexInfo> result = stream.next();
        final String shard = result.first();
        final IndexInfo info = result.second();

        assertTrue("Expected shard to start with '20190317_' but was: " + shard, shard.startsWith("20190317_"));
        assertEquals(2, info.count());

        shardsSeen++;
        documentsSeen += info.count();
    }

    assertEquals(15, shardsSeen);
    assertEquals(30, documentsSeen);
    assertFalse(stream.hasNext());
}
Aggregations