
Example 11 with CollectorContext

Use of io.crate.expression.reference.doc.lucene.CollectorContext in project crate by crate.

The class OrderedLuceneBatchIteratorBenchmark, method createLuceneBatchIterator:

@Setup
public void createLuceneBatchIterator() throws Exception {
    // Build an in-memory index; ByteBuffersDirectory keeps all segments on the heap.
    IndexWriter iw = new IndexWriter(new ByteBuffersDirectory(), new IndexWriterConfig(new StandardAnalyzer()));
    dummyShardId = new ShardId("dummy", UUIDs.randomBase64UUID(), 1);
    columnName = "x";
    // Index 10 million documents, each carrying its loop index as a doc value.
    for (int i = 0; i < 10_000_000; i++) {
        Document doc = new Document();
        doc.add(new NumericDocValuesField(columnName, i));
        iw.addDocument(doc);
    }
    iw.commit();
    // Merge down to a single segment so every run iterates identical, stable segments.
    iw.forceMerge(1, true);
    indexSearcher = new IndexSearcher(DirectoryReader.open(iw, true, true));
    collectorContext = new CollectorContext();
    reference = new Reference(new ReferenceIdent(new RelationName(Schemas.DOC_SCHEMA_NAME, "dummyTable"), columnName), RowGranularity.DOC, DataTypes.INTEGER, 1, null);
    orderBy = new OrderBy(Collections.singletonList(reference), reverseFlags, nullsFirst);
}
Also used: IndexSearcher(org.apache.lucene.search.IndexSearcher) OrderBy(io.crate.analyze.OrderBy) Reference(io.crate.metadata.Reference) Document(org.apache.lucene.document.Document) ReferenceIdent(io.crate.metadata.ReferenceIdent) ShardId(org.elasticsearch.index.shard.ShardId) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) IndexWriter(org.apache.lucene.index.IndexWriter) ByteBuffersDirectory(org.apache.lucene.store.ByteBuffersDirectory) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) RelationName(io.crate.metadata.RelationName) CollectorContext(io.crate.expression.reference.doc.lucene.CollectorContext) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) Setup(org.openjdk.jmh.annotations.Setup)
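For context, here is a minimal sketch of consuming the index built in this @Setup, expressed with plain Lucene sorting (Sort, SortField, TopFieldDocs, FieldDoc from org.apache.lucene.search) rather than crate's ordered batch iterator, which is what the benchmark actually exercises. The descending order and the batch size of 10 are arbitrary choices for illustration; columnName and indexSearcher come from the setup above.

// Sketch only: read documents back in doc-value order, which is in essence
// what an ordered Lucene batch iterator does one batch at a time.
Sort sort = new Sort(new SortField(columnName, SortField.Type.LONG, true));
TopFieldDocs hits = indexSearcher.search(new MatchAllDocsQuery(), 10, sort);
for (ScoreDoc scoreDoc : hits.scoreDocs) {
    // The column is a doc-values field, not a stored field, so its value
    // is read from the sort key instead of the document source.
    long x = (Long) ((FieldDoc) scoreDoc).fields[0];
}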

Example 12 with CollectorContext

Use of io.crate.expression.reference.doc.lucene.CollectorContext in project crate by crate.

The class DocValuesGroupByOptimizedIterator, method tryOptimize:

@Nullable
static BatchIterator<Row> tryOptimize(Functions functions, IndexShard indexShard, DocTableInfo table, LuceneQueryBuilder luceneQueryBuilder, FieldTypeLookup fieldTypeLookup, DocInputFactory docInputFactory, RoutedCollectPhase collectPhase, CollectTask collectTask) {
    // _score exists only at query collection time and cannot be served from doc values.
    if (Symbols.containsColumn(collectPhase.toCollect(), DocSysColumns.SCORE) || Symbols.containsColumn(collectPhase.where(), DocSysColumns.SCORE)) {
        return null;
    }
    Collection<? extends Projection> shardProjections = shardProjections(collectPhase.projections());
    GroupProjection groupProjection = getSinglePartialGroupProjection(shardProjections);
    if (groupProjection == null) {
        return null;
    }
    ArrayList<Reference> columnKeyRefs = new ArrayList<>(groupProjection.keys().size());
    for (var key : groupProjection.keys()) {
        var docKeyRef = getKeyRef(collectPhase.toCollect(), key);
        if (docKeyRef == null) {
            // group by on non-reference
            return null;
        }
        var columnKeyRef = (Reference) DocReferences.inverseSourceLookup(docKeyRef);
        var keyFieldType = fieldTypeLookup.get(columnKeyRef.column().fqn());
        // Every grouping key column must be backed by doc values.
        if (keyFieldType == null || !keyFieldType.hasDocValues()) {
            return null;
        } else {
            columnKeyRefs.add(columnKeyRef);
        }
    }
    // noinspection rawtypes
    List<DocValueAggregator> aggregators = DocValuesAggregates.createAggregators(functions, groupProjection.values(), collectPhase.toCollect(), collectTask.txnCtx().sessionSettings().searchPath(), table);
    if (aggregators == null) {
        return null;
    }
    ShardId shardId = indexShard.shardId();
    SharedShardContext sharedShardContext = collectTask.sharedShardContexts().getOrCreateContext(shardId);
    var searcher = sharedShardContext.acquireSearcher("group-by-doc-value-aggregates: " + formatSource(collectPhase));
    collectTask.addSearcher(sharedShardContext.readerId(), searcher);
    QueryShardContext queryShardContext = sharedShardContext.indexService().newQueryShardContext();
    InputFactory.Context<? extends LuceneCollectorExpression<?>> docCtx = docInputFactory.getCtx(collectTask.txnCtx());
    List<LuceneCollectorExpression<?>> keyExpressions = new ArrayList<>();
    for (var keyRef : columnKeyRefs) {
        keyExpressions.add((LuceneCollectorExpression<?>) docCtx.add(keyRef));
    }
    LuceneQueryBuilder.Context queryContext = luceneQueryBuilder.convert(collectPhase.where(), collectTask.txnCtx(), indexShard.mapperService(), indexShard.shardId().getIndexName(), queryShardContext, table, sharedShardContext.indexService().cache());
    if (columnKeyRefs.size() == 1) {
        return GroupByIterator.forSingleKey(aggregators, searcher.item(), columnKeyRefs.get(0), keyExpressions, collectTask.getRamAccounting(), collectTask.memoryManager(), collectTask.minNodeVersion(), queryContext.query(), new CollectorContext(sharedShardContext.readerId()));
    } else {
        return GroupByIterator.forManyKeys(aggregators, searcher.item(), columnKeyRefs, keyExpressions, collectTask.getRamAccounting(), collectTask.memoryManager(), collectTask.minNodeVersion(), queryContext.query(), new CollectorContext(sharedShardContext.readerId()));
    }
}
Also used: InputFactory(io.crate.expression.InputFactory) DocValueAggregator(io.crate.execution.engine.aggregation.DocValueAggregator) AtomicReference(java.util.concurrent.atomic.AtomicReference) Reference(io.crate.metadata.Reference) ArrayList(java.util.ArrayList) ShardId(org.elasticsearch.index.shard.ShardId) LuceneQueryBuilder(io.crate.lucene.LuceneQueryBuilder) QueryShardContext(org.elasticsearch.index.query.QueryShardContext) CollectorContext(io.crate.expression.reference.doc.lucene.CollectorContext) GroupProjection(io.crate.execution.dsl.projection.GroupProjection) SharedShardContext(io.crate.execution.jobs.SharedShardContext) LuceneCollectorExpression(io.crate.expression.reference.doc.lucene.LuceneCollectorExpression) Nullable(javax.annotation.Nullable)
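The null returns above form the method's fallback contract: a caller tries the doc-values optimization first and drops to the generic collect path when any precondition fails. A hedged sketch of a hypothetical call site follows; createGenericIterator is an illustrative name, not crate API.

BatchIterator<Row> iterator = DocValuesGroupByOptimizedIterator.tryOptimize(
    functions, indexShard, table, luceneQueryBuilder,
    fieldTypeLookup, docInputFactory, collectPhase, collectTask);
if (iterator == null) {
    // A precondition failed (_score usage, a non-reference grouping key,
    // missing doc values, or unsupported aggregations): use the slow path.
    iterator = createGenericIterator(collectPhase, collectTask); // hypothetical fallback
}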

Example 13 with CollectorContext

Use of io.crate.expression.reference.doc.lucene.CollectorContext in project crate by crate.

The class ReservoirSampler, method getSamples:

private Samples getSamples(List<Reference> columns, int maxSamples, DocTableInfo docTable, Random random, Metadata metadata, CoordinatorTxnCtx coordinatorTxnCtx, List<Streamer> streamers, List<Engine.Searcher> searchersToRelease, RamAccounting ramAccounting) {
    // Account up front for the memory the fetch-id reservoir will occupy.
    ramAccounting.addBytes(DataTypes.LONG.fixedSize() * maxSamples);
    Reservoir<Long> fetchIdSamples = new Reservoir<>(maxSamples, random);
    ArrayList<DocIdToRow> docIdToRowsFunctionPerReader = new ArrayList<>();
    long totalNumDocs = 0;
    long totalSizeInBytes = 0;
    for (String index : docTable.concreteOpenIndices()) {
        var indexMetadata = metadata.index(index);
        if (indexMetadata == null) {
            continue;
        }
        var indexService = indicesService.indexService(indexMetadata.getIndex());
        if (indexService == null) {
            continue;
        }
        var mapperService = indexService.mapperService();
        FieldTypeLookup fieldTypeLookup = mapperService::fullName;
        var ctx = new DocInputFactory(nodeCtx, new LuceneReferenceResolver(indexService.index().getName(), fieldTypeLookup, docTable.partitionedByColumns())).getCtx(coordinatorTxnCtx);
        ctx.add(columns);
        List<Input<?>> inputs = ctx.topLevelInputs();
        List<? extends LuceneCollectorExpression<?>> expressions = ctx.expressions();
        CollectorContext collectorContext = new CollectorContext();
        for (LuceneCollectorExpression<?> expression : expressions) {
            expression.startCollect(collectorContext);
        }
        for (IndexShard indexShard : indexService) {
            if (!indexShard.routingEntry().primary()) {
                continue;
            }
            try {
                Engine.Searcher searcher = indexShard.acquireSearcher("update-table-statistics");
                searchersToRelease.add(searcher);
                totalNumDocs += searcher.getIndexReader().numDocs();
                totalSizeInBytes += indexShard.storeStats().getSizeInBytes();
                DocIdToRow docIdToRow = new DocIdToRow(searcher, inputs, expressions);
                docIdToRowsFunctionPerReader.add(docIdToRow);
                try {
                    // We do the sampling in two phases: first collect the docIds,
                    // then retrieve the column values for the sampled docIds.
                    // Two phases are needed because reservoir sampling may
                    // overwrite previously seen items, and we want to avoid
                    // unnecessary disk lookups for rows that get discarded.
                    var collector = new ReservoirCollector(fetchIdSamples, searchersToRelease.size() - 1);
                    searcher.search(new MatchAllDocsQuery(), collector);
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            } catch (IllegalIndexShardStateException | AlreadyClosedException ignored) {
            }
        }
    }
    var rowAccounting = new RowCellsAccountingWithEstimators(Symbols.typeView(columns), ramAccounting, 0);
    ArrayList<Row> records = new ArrayList<>();
    for (long fetchId : fetchIdSamples.samples()) {
        int readerId = FetchId.decodeReaderId(fetchId);
        DocIdToRow docIdToRow = docIdToRowsFunctionPerReader.get(readerId);
        Object[] row = docIdToRow.apply(FetchId.decodeDocId(fetchId));
        try {
            rowAccounting.accountForAndMaybeBreak(row);
        } catch (CircuitBreakingException e) {
            LOGGER.info("Stopped gathering samples for `ANALYZE` operation because circuit breaker triggered. " + "Generating statistics with {} instead of {} records", records.size(), maxSamples);
            break;
        }
        records.add(new RowN(row));
    }
    return new Samples(records, streamers, totalNumDocs, totalSizeInBytes);
}
Also used: DocInputFactory(io.crate.execution.engine.collect.DocInputFactory) ArrayList(java.util.ArrayList) UncheckedIOException(java.io.UncheckedIOException) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) RowCellsAccountingWithEstimators(io.crate.breaker.RowCellsAccountingWithEstimators) Input(io.crate.data.Input) LuceneReferenceResolver(io.crate.expression.reference.doc.lucene.LuceneReferenceResolver) CollectorContext(io.crate.expression.reference.doc.lucene.CollectorContext) Engine(org.elasticsearch.index.engine.Engine) IndexShard(org.elasticsearch.index.shard.IndexShard) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) IllegalIndexShardStateException(org.elasticsearch.index.shard.IllegalIndexShardStateException) RowN(io.crate.data.RowN) FieldTypeLookup(io.crate.lucene.FieldTypeLookup) CircuitBreakingException(org.elasticsearch.common.breaker.CircuitBreakingException) Row(io.crate.data.Row)
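The two-phase sampling above relies on reservoir sampling (Algorithm R): every fetchId in the stream ends up in the fixed-size sample with equal probability, and later items may overwrite earlier ones, which is exactly why row materialization is deferred until the sample is final. Below is a minimal, self-contained sketch of the technique; this is the textbook algorithm, not crate's Reservoir class.

// Uniform random sample of size k from a stream of unknown length, O(k) memory.
static long[] reservoirSample(java.util.PrimitiveIterator.OfLong stream, int k, java.util.Random random) {
    long[] samples = new long[k];
    int seen = 0;
    while (stream.hasNext()) {
        long value = stream.nextLong();
        if (seen < k) {
            // Fill the reservoir with the first k items.
            samples[seen] = value;
        } else {
            // Keep item number seen+1 with probability k / (seen + 1).
            int slot = random.nextInt(seen + 1);
            if (slot < k) {
                samples[slot] = value; // overwrites a previously sampled item
            }
        }
        seen++;
    }
    return samples; // uniform sample, assuming the stream held at least k items
}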

Aggregations

CollectorContext (io.crate.expression.reference.doc.lucene.CollectorContext) 13
Reference (io.crate.metadata.Reference) 6
InputFactory (io.crate.expression.InputFactory) 5
AtomicReference (java.util.concurrent.atomic.AtomicReference) 5
MatchAllDocsQuery (org.apache.lucene.search.MatchAllDocsQuery) 5
ShardId (org.elasticsearch.index.shard.ShardId) 4
SharedShardContext (io.crate.execution.jobs.SharedShardContext) 3
LuceneQueryBuilder (io.crate.lucene.LuceneQueryBuilder) 3
ReferenceIdent (io.crate.metadata.ReferenceIdent) 3
DocTableInfo (io.crate.metadata.doc.DocTableInfo) 3
CrateDummyClusterServiceUnitTest (io.crate.test.integration.CrateDummyClusterServiceUnitTest) 3
TestingRowConsumer (io.crate.testing.TestingRowConsumer) 3
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer) 3
Document (org.apache.lucene.document.Document) 3
NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField) 3
IndexWriter (org.apache.lucene.index.IndexWriter) 3
QueryShardContext (org.elasticsearch.index.query.QueryShardContext) 3
Test (org.junit.Test) 3
OrderBy (io.crate.analyze.OrderBy) 2
RamAccounting (io.crate.breaker.RamAccounting) 2