Search in sources :

Example 1 with IndexInfo

use of datawave.query.index.lookup.IndexInfo in project datawave by NationalSecurityAgency.

the class RangeStreamScanner method scannerInvariant.

/**
 * Consumes entries from {@code iter}, buffering them in {@code currentQueue} one day at a time, and either releases the buffered entries via
 * {@code dequeue()} or replaces them with a single day-level entry on {@code resultQueue}.
 * <p>
 * Steps, in order:
 * <ol>
 * <li>If a day-level entry is still pending in {@code prevDay} from an earlier call, it is offered to {@code resultQueue} with a one second timeout. If the
 * offer times out or the thread is interrupted, the method returns 0 without touching {@code iter}.</li>
 * <li>Under {@code writeLock}, entries are peeked one by one. The first entry of a day is always queued. Later entries of the same day are queued while
 * {@code currentQueue.size() <= shardsPerDayThreshold} or the median of the counts seen so far is below {@code MAX_MEDIAN}; otherwise the loop stops with
 * the entry left unconsumed. When an entry for a different day is seen, the queue is dequeued and the loop continues only if everything was dequeued and
 * {@code retrievalCount} exceeds {@code ceil(maxResults * 1.5)}.</li>
 * <li>After the loop, if the queue holds at least {@code shardsPerDayThreshold} entries and the median count exceeds {@code MAX_MEDIAN}, the head of the
 * queue is rewritten into a key whose column qualifier is the current day, paired with the value from {@code writeInfoToValue()}, and offered to
 * {@code resultQueue}; if that offer fails the entry is kept in {@code prevDay} for the next call. The queue is then cleared. Otherwise the queue is
 * dequeued normally.</li>
 * </ol>
 *
 * @param iter
 *            source of key/value pairs; wrapped in a {@link PeekingIterator} so an entry can be inspected before it is consumed
 * @return the sum of the values returned by {@code dequeue()} during this call, or 0 if the pending {@code prevDay} entry could not be delivered
 */
protected int scannerInvariant(final Iterator<Entry<Key, Value>> iter) {
    PeekingIterator<Entry<Key, Value>> kvIter = new PeekingIterator<>(iter);
    int retrievalCount = 0;
    Entry<Key, Value> myEntry;
    String currentDay = null;
    // Deliver the day-level entry that could not be queued during a previous call before doing any new work.
    if (null != prevDay) {
        try {
            if (log.isTraceEnabled())
                log.trace("Attempting to insert " + prevDay);
            if (!resultQueue.offer(prevDay, 1, TimeUnit.SECONDS)) {
                return 0;
            }
            prevDay = null;
        } catch (InterruptedException e) {
            // NOTE(review): the interrupt status is not restored here; confirm whether callers rely on that before changing it.
            return 0;
        }
    }
    // produces stats for us, so we don't have to!
    DescriptiveStatistics stats = new DescriptiveStatistics();
    writeLock.lock();
    try {
        while (kvIter.hasNext()) {
            // Peek only: the entry is consumed further down, once we know it belongs to the current batch.
            Entry<Key, Value> currentKeyValue = kvIter.peek();
            // become a pass-through if we've seen an unexpected key.
            if (seenUnexpectedKey) {
                currentQueue.add(trimTrailingUnderscore(currentKeyValue));
                break;
            }
            if (null == currentDay) {
                // Resolve the day before logging so the trace message reports the actual day rather than null.
                currentDay = getDay(currentKeyValue.getKey());
                if (log.isTraceEnabled()) {
                    log.trace("it's a new day!");
                    log.trace("adding " + currentKeyValue.getKey() + " to queue because it matches" + currentDay);
                }
                // The first entry of a day is queued unconditionally and is not added to the statistics.
                currentQueue.add(trimTrailingUnderscore(currentKeyValue));
                lastSeenKey = kvIter.next().getKey();
            } else {
                String nextKeysDay = getDay(currentKeyValue.getKey());
                if (currentDay.equals(nextKeysDay)) {
                    if (log.isTraceEnabled()) {
                        log.trace("adding " + currentKeyValue.getKey() + " to queue because it matches" + currentDay);
                    }
                    IndexInfo info = readInfoFromValue(currentKeyValue.getValue());
                    if (log.isTraceEnabled()) {
                        log.trace("adding count of " + info.count());
                    }
                    stats.addValue(info.count());
                    // Keep buffering while the day is still small enough or its median count is still low enough.
                    if (currentQueue.size() <= shardsPerDayThreshold || stats.getPercentile(50) < MAX_MEDIAN) {
                        if (log.isTraceEnabled()) {
                            log.trace("adding our stats are " + stats.getPercentile(50) + " on " + currentQueue.size());
                        }
                        currentQueue.add(trimTrailingUnderscore(currentKeyValue));
                    } else {
                        if (log.isTraceEnabled()) {
                            log.trace("breaking because our stats are " + stats.getPercentile(50) + " on " + currentQueue.size());
                        }
                        // Leave the entry unconsumed; the post-loop check decides what happens to the buffered day.
                        break;
                    }
                    lastSeenKey = kvIter.next().getKey();
                } else {
                    // The day changed: release what was buffered for the previous day. The entry stays unconsumed
                    // and is picked up as the start of a new day on the next iteration (currentDay is reset below).
                    int dequeueCount = dequeue();
                    retrievalCount += dequeueCount;
                    // NOTE(review): queueSize is sampled after dequeue() has already run; confirm this is the intended comparison.
                    int queueSize = currentQueue.size();
                    dequeue(true);
                    currentDay = null;
                    if (dequeueCount != queueSize || retrievalCount <= Math.ceil(maxResults * 1.5)) {
                        break;
                    }
                }
            }
        }
        if (currentQueue.size() >= shardsPerDayThreshold && stats.getPercentile(50) > MAX_MEDIAN) {
            // Replace the buffered entries with a single entry whose column qualifier is the day itself.
            Entry<Key, Value> top = currentQueue.poll();
            Key topKey = top.getKey();
            if (log.isTraceEnabled())
                log.trace(topKey + " for " + currentDay + " exceeds limit of " + shardsPerDayThreshold + " with " + currentQueue.size());
            Key newKey = new Key(topKey.getRow(), topKey.getColumnFamily(), new Text(currentDay), topKey.getColumnVisibility(), topKey.getTimestamp());
            Value newValue = writeInfoToValue();
            myEntry = Maps.immutableEntry(newKey, newValue);
            lastSeenKey = newKey;
            try {
                if (!resultQueue.offer(myEntry, 1, TimeUnit.SECONDS)) {
                    if (log.isTraceEnabled()) {
                        log.trace("could not add day! converting " + myEntry + " to " + prevDay);
                    }
                    // Remember the entry so the next call can retry delivering it.
                    prevDay = myEntry;
                }
            } catch (InterruptedException exception) {
                // NOTE(review): the interrupt status is not restored here either; the entry is kept for a retry on the next call.
                prevDay = myEntry;
            }
            currentQueue.clear();
        } else {
            retrievalCount += dequeue();
        }
    } finally {
        writeLock.unlock();
    }
    return retrievalCount;
}
Also used : DescriptiveStatistics(org.apache.commons.math3.stat.descriptive.DescriptiveStatistics) Entry(java.util.Map.Entry) Value(org.apache.accumulo.core.data.Value) Text(org.apache.hadoop.io.Text) PeekingIterator(org.apache.accumulo.core.util.PeekingIterator) IndexInfo(datawave.query.index.lookup.IndexInfo) Key(org.apache.accumulo.core.data.Key)

Example 2 with IndexInfo

use of datawave.query.index.lookup.IndexInfo in project datawave by NationalSecurityAgency.

the class RangeStreamScannerTest method testTheSimplestOfScans.

/**
 * Baseline scan: the term FOO == 'bar' is expected to produce exactly one result for 20190314 carrying two document ids.
 */
@Test
public void testTheSimplestOfScans() throws Exception {
    // Query under test: FOO == 'bar'
    final String field = "FOO";
    final String value = "bar";
    final ASTEQNode node = (ASTEQNode) JexlNodeFactory.buildEQNode(field, value);

    // Wire the range scanner and the entry parser into the stream being verified.
    final RangeStreamScanner scanner = buildRangeStreamScanner(field, value);
    final EntryParser parser = new EntryParser(node, field, value, config.getIndexedFields());
    final ScannerStream stream = ScannerStream.initialized(scanner, parser, node);

    // There must be at least one result before we start counting.
    assertTrue(stream.hasNext());

    int shardsSeen = 0;
    int documentsSeen = 0;
    while (stream.hasNext()) {
        final Tuple2<String, IndexInfo> result = stream.next();
        final IndexInfo info = result.second();
        assertEquals("Expected shard to start with '20190314' but was: " + result.first(), "20190314", result.first());
        assertEquals(2, info.count());
        shardsSeen++;
        documentsSeen += info.count();
    }

    // One result, two documents, and nothing left over.
    assertEquals(1, shardsSeen);
    assertEquals(2, documentsSeen);
    assertFalse(stream.hasNext());
}
Also used : ASTEQNode(org.apache.commons.jexl2.parser.ASTEQNode) IndexInfo(datawave.query.index.lookup.IndexInfo) EntryParser(datawave.query.index.lookup.EntryParser) ScannerStream(datawave.query.index.lookup.ScannerStream) Test(org.junit.Test)

Example 3 with IndexInfo

use of datawave.query.index.lookup.IndexInfo in project datawave by NationalSecurityAgency.

the class RangeStreamScannerTest method testExceedShardsPerDayThresholdAndDocumentsPerShardThreshold.

/**
 * FOO == 'boohoo' hits day 20190323 with 15 shards, each shard has 25 document ids. The scan is expected to come back as a single result for that day
 * with a count of -1.
 */
@Test
public void testExceedShardsPerDayThresholdAndDocumentsPerShardThreshold() throws Exception {
    // Query under test: FOO == 'boohoo'
    final String field = "FOO";
    final String value = "boohoo";
    final ASTEQNode node = (ASTEQNode) JexlNodeFactory.buildEQNode(field, value);

    // Wire the range scanner and the entry parser into the stream being verified.
    final RangeStreamScanner scanner = buildRangeStreamScanner(field, value);
    final EntryParser parser = new EntryParser(node, field, value, config.getIndexedFields());
    final ScannerStream stream = ScannerStream.initialized(scanner, parser, node);

    // There must be at least one result before we start counting.
    assertTrue(stream.hasNext());

    int shardsSeen = 0;
    int documentsSeen = 0;
    while (stream.hasNext()) {
        final Tuple2<String, IndexInfo> result = stream.next();
        assertTrue("Expected shard to start with '20190323' but was: " + result.first(), result.first().startsWith("20190323"));
        shardsSeen++;
        documentsSeen += result.second().count();
    }

    // A single range with a count of -1 means the shard ranges were collapsed into a day range.
    assertEquals(1, shardsSeen);
    assertEquals(-1, documentsSeen);
    assertFalse(stream.hasNext());
}
Also used : ASTEQNode(org.apache.commons.jexl2.parser.ASTEQNode) IndexInfo(datawave.query.index.lookup.IndexInfo) EntryParser(datawave.query.index.lookup.EntryParser) ScannerStream(datawave.query.index.lookup.ScannerStream) Test(org.junit.Test)

Example 4 with IndexInfo

use of datawave.query.index.lookup.IndexInfo in project datawave by NationalSecurityAgency.

the class RangeStreamScannerTest method testExceedMaxMedianDocumentsPerShard.

/**
 * FOO == 'boo' hits day 20190319 with 8 shards, each shard has 15 document ids. Every shard is expected to be returned individually.
 */
@Test
public void testExceedMaxMedianDocumentsPerShard() throws Exception {
    // Query under test: FOO == 'boo'
    final String field = "FOO";
    final String value = "boo";
    final ASTEQNode node = (ASTEQNode) JexlNodeFactory.buildEQNode(field, value);

    // Parse the scanner output through a transforming iterator and hand that to the stream.
    final RangeStreamScanner scanner = buildRangeStreamScanner(field, value);
    final EntryParser parser = new EntryParser(node, field, value, config.getIndexedFields());
    final Iterator<Tuple2<String, IndexInfo>> parsedEntries = Iterators.transform(scanner, parser);
    final ScannerStream stream = ScannerStream.initialized(parsedEntries, node);

    // There must be at least one result before we start counting.
    assertTrue(stream.hasNext());

    int shardsSeen = 0;
    int documentsSeen = 0;
    while (stream.hasNext()) {
        final Tuple2<String, IndexInfo> result = stream.next();
        final IndexInfo info = result.second();
        assertTrue("Expected shard to start with '20190319_' but was: " + result.first(), result.first().startsWith("20190319_"));
        assertEquals(15, info.count());
        shardsSeen++;
        documentsSeen += info.count();
    }

    // 8 shards x 15 documents each, and nothing left over.
    assertEquals(8, shardsSeen);
    assertEquals(120, documentsSeen);
    assertFalse(stream.hasNext());
}
Also used : ASTEQNode(org.apache.commons.jexl2.parser.ASTEQNode) Tuple2(datawave.query.util.Tuple2) IndexInfo(datawave.query.index.lookup.IndexInfo) EntryParser(datawave.query.index.lookup.EntryParser) ScannerStream(datawave.query.index.lookup.ScannerStream) Test(org.junit.Test)

Example 5 with IndexInfo

use of datawave.query.index.lookup.IndexInfo in project datawave by NationalSecurityAgency.

the class RangeStreamScannerTest method testExceedShardDayThreshold.

/**
 * FOO == 'baz' hits day 20190317 with 15 shards, each shard has 2 document ids. Every shard is expected to be returned individually.
 */
@Test
public void testExceedShardDayThreshold() throws Exception {
    // Query under test: FOO == 'baz'
    final String field = "FOO";
    final String value = "baz";
    final ASTEQNode node = (ASTEQNode) JexlNodeFactory.buildEQNode(field, value);

    // Parse the scanner output through a transforming iterator and hand that to the stream.
    final RangeStreamScanner scanner = buildRangeStreamScanner(field, value);
    final EntryParser parser = new EntryParser(node, field, value, config.getIndexedFields());
    final Iterator<Tuple2<String, IndexInfo>> parsedEntries = Iterators.transform(scanner, parser);
    final ScannerStream stream = ScannerStream.initialized(parsedEntries, node);

    // There must be at least one result before we start counting.
    assertTrue(stream.hasNext());

    int shardsSeen = 0;
    int documentsSeen = 0;
    while (stream.hasNext()) {
        final Tuple2<String, IndexInfo> result = stream.next();
        final IndexInfo info = result.second();
        assertTrue("Expected shard to start with '20190317_' but was: " + result.first(), result.first().startsWith("20190317_"));
        assertEquals(2, info.count());
        shardsSeen++;
        documentsSeen += info.count();
    }

    // 15 shards x 2 documents each, and nothing left over.
    assertEquals(15, shardsSeen);
    assertEquals(30, documentsSeen);
    assertFalse(stream.hasNext());
}
Also used : ASTEQNode(org.apache.commons.jexl2.parser.ASTEQNode) Tuple2(datawave.query.util.Tuple2) IndexInfo(datawave.query.index.lookup.IndexInfo) EntryParser(datawave.query.index.lookup.EntryParser) ScannerStream(datawave.query.index.lookup.ScannerStream) Test(org.junit.Test)

Aggregations

IndexInfo (datawave.query.index.lookup.IndexInfo)6 EntryParser (datawave.query.index.lookup.EntryParser)4 ScannerStream (datawave.query.index.lookup.ScannerStream)4 ASTEQNode (org.apache.commons.jexl2.parser.ASTEQNode)4 Test (org.junit.Test)4 Tuple2 (datawave.query.util.Tuple2)2 DatawaveFatalQueryException (datawave.query.exceptions.DatawaveFatalQueryException)1 IndexMatch (datawave.query.index.lookup.IndexMatch)1 ByteArrayInputStream (java.io.ByteArrayInputStream)1 DataInputStream (java.io.DataInputStream)1 IOException (java.io.IOException)1 Entry (java.util.Map.Entry)1 Key (org.apache.accumulo.core.data.Key)1 Value (org.apache.accumulo.core.data.Value)1 PeekingIterator (org.apache.accumulo.core.util.PeekingIterator)1 DescriptiveStatistics (org.apache.commons.math3.stat.descriptive.DescriptiveStatistics)1 Text (org.apache.hadoop.io.Text)1