Use of io.crate.expression.reference.doc.lucene.CollectorContext in project crate by crate.
From class OrderedLuceneBatchIteratorBenchmark, method createLuceneBatchIterator:
@Setup
public void createLuceneBatchIterator() throws Exception {
    IndexWriter iw = new IndexWriter(
        new ByteBuffersDirectory(),
        new IndexWriterConfig(new StandardAnalyzer())
    );
    dummyShardId = new ShardId("dummy", UUIDs.randomBase64UUID(), 1);
    columnName = "x";
    for (int i = 0; i < 10_000_000; i++) {
        Document doc = new Document();
        doc.add(new NumericDocValuesField(columnName, i));
        iw.addDocument(doc);
    }
    iw.commit();
    iw.forceMerge(1, true);
    indexSearcher = new IndexSearcher(DirectoryReader.open(iw, true, true));
    collectorContext = new CollectorContext();
    reference = new Reference(
        new ReferenceIdent(new RelationName(Schemas.DOC_SCHEMA_NAME, "dummyTable"), columnName),
        RowGranularity.DOC,
        DataTypes.INTEGER,
        1,
        null
    );
    orderBy = new OrderBy(Collections.singletonList(reference), reverseFlags, nullsFirst);
}
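The setup indexes ten million documents whose only field is a NumericDocValuesField, so the benchmarked iterator can serve ordering entirely from columnar doc values. As a rough illustration of what such a column looks like on the read side, here is a minimal, self-contained sketch using plain Lucene only (no crate APIs; the class name DocValuesReadBack and the small document count are illustrative, not from the benchmark):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.ByteBuffersDirectory;

public class DocValuesReadBack {
    public static void main(String[] args) throws Exception {
        try (var dir = new ByteBuffersDirectory();
             var iw = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            for (int i = 0; i < 5; i++) {
                Document doc = new Document();
                doc.add(new NumericDocValuesField("x", i)); // columnar storage, as in the setup above
                iw.addDocument(doc);
            }
            iw.commit();
            try (DirectoryReader reader = DirectoryReader.open(iw)) {
                for (LeafReaderContext leaf : reader.leaves()) {
                    NumericDocValues values = leaf.reader().getNumericDocValues("x");
                    if (values == null) {
                        continue; // this segment has no values for the column
                    }
                    // NumericDocValues is a DocIdSetIterator: advance doc by doc in order.
                    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
                        System.out.println("doc=" + doc + " x=" + values.longValue());
                    }
                }
            }
        }
    }
}

In the benchmark the documents are inserted in ascending order of x, so after forceMerge(1, true) the docId order within the single segment coincides with the sort order the iterator has to produce.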
Use of io.crate.expression.reference.doc.lucene.CollectorContext in project crate by crate.
From class DocValuesGroupByOptimizedIterator, method tryOptimize:
@Nullable
static BatchIterator<Row> tryOptimize(Functions functions,
                                      IndexShard indexShard,
                                      DocTableInfo table,
                                      LuceneQueryBuilder luceneQueryBuilder,
                                      FieldTypeLookup fieldTypeLookup,
                                      DocInputFactory docInputFactory,
                                      RoutedCollectPhase collectPhase,
                                      CollectTask collectTask) {
    // The optimized path does not score documents, so bail out if _score is used anywhere.
    if (Symbols.containsColumn(collectPhase.toCollect(), DocSysColumns.SCORE)
            || Symbols.containsColumn(collectPhase.where(), DocSysColumns.SCORE)) {
        return null;
    }
    Collection<? extends Projection> shardProjections = shardProjections(collectPhase.projections());
    GroupProjection groupProjection = getSinglePartialGroupProjection(shardProjections);
    if (groupProjection == null) {
        return null;
    }
    // Every group key must be a reference to a column that has doc values enabled.
    ArrayList<Reference> columnKeyRefs = new ArrayList<>(groupProjection.keys().size());
    for (var key : groupProjection.keys()) {
        var docKeyRef = getKeyRef(collectPhase.toCollect(), key);
        if (docKeyRef == null) {
            // group by on non-reference
            return null;
        }
        var columnKeyRef = (Reference) DocReferences.inverseSourceLookup(docKeyRef);
        var keyFieldType = fieldTypeLookup.get(columnKeyRef.column().fqn());
        if (keyFieldType == null || !keyFieldType.hasDocValues()) {
            return null;
        } else {
            columnKeyRefs.add(columnKeyRef);
        }
    }
    // noinspection rawtypes
    List<DocValueAggregator> aggregators = DocValuesAggregates.createAggregators(
        functions, groupProjection.values(), collectPhase.toCollect(),
        collectTask.txnCtx().sessionSettings().searchPath(), table);
    if (aggregators == null) {
        return null;
    }
    ShardId shardId = indexShard.shardId();
    SharedShardContext sharedShardContext = collectTask.sharedShardContexts().getOrCreateContext(shardId);
    var searcher = sharedShardContext.acquireSearcher("group-by-doc-value-aggregates: " + formatSource(collectPhase));
    collectTask.addSearcher(sharedShardContext.readerId(), searcher);
    QueryShardContext queryShardContext = sharedShardContext.indexService().newQueryShardContext();
    InputFactory.Context<? extends LuceneCollectorExpression<?>> docCtx = docInputFactory.getCtx(collectTask.txnCtx());
    List<LuceneCollectorExpression<?>> keyExpressions = new ArrayList<>();
    for (var keyRef : columnKeyRefs) {
        keyExpressions.add((LuceneCollectorExpression<?>) docCtx.add(keyRef));
    }
    LuceneQueryBuilder.Context queryContext = luceneQueryBuilder.convert(
        collectPhase.where(), collectTask.txnCtx(), indexShard.mapperService(),
        indexShard.shardId().getIndexName(), queryShardContext, table,
        sharedShardContext.indexService().cache());
    if (columnKeyRefs.size() == 1) {
        return GroupByIterator.forSingleKey(
            aggregators, searcher.item(), columnKeyRefs.get(0), keyExpressions,
            collectTask.getRamAccounting(), collectTask.memoryManager(),
            collectTask.minNodeVersion(), queryContext.query(),
            new CollectorContext(sharedShardContext.readerId()));
    } else {
        return GroupByIterator.forManyKeys(
            aggregators, searcher.item(), columnKeyRefs, keyExpressions,
            collectTask.getRamAccounting(), collectTask.memoryManager(),
            collectTask.minNodeVersion(), queryContext.query(),
            new CollectorContext(sharedShardContext.readerId()));
    }
}
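Every early return null above guards the same invariant: the group keys and the aggregates must be computable from doc values alone (no _score, no _source lookup). Per segment, the work GroupByIterator then performs is conceptually close to the following sketch, which uses plain Lucene and a hypothetical long-keyed count (crate's actual GroupByIterator, query handling, and RamAccounting are not shown):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;

final class DocValuesGroupCount {

    /** Count rows per distinct value of a numeric doc-values column, segment by segment. */
    static Map<Long, Long> countsByKey(DirectoryReader reader, String keyColumn) throws IOException {
        Map<Long, Long> counts = new HashMap<>();
        for (LeafReaderContext leaf : reader.leaves()) {
            NumericDocValues keys = leaf.reader().getNumericDocValues(keyColumn);
            if (keys == null) {
                continue; // no doc values for this column in this segment
            }
            for (int doc = keys.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = keys.nextDoc()) {
                counts.merge(keys.longValue(), 1L, Long::sum);
            }
        }
        return counts;
    }
}

The real iterator additionally restricts the scan to documents matching queryContext.query() and accounts for the memory of each group; the point of the optimization is that neither the keys nor the aggregate inputs ever touch the stored _source.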
Use of io.crate.expression.reference.doc.lucene.CollectorContext in project crate by crate.
From class ReservoirSampler, method getSamples:
private Samples getSamples(List<Reference> columns,
                           int maxSamples,
                           DocTableInfo docTable,
                           Random random,
                           Metadata metadata,
                           CoordinatorTxnCtx coordinatorTxnCtx,
                           List<Streamer> streamers,
                           List<Engine.Searcher> searchersToRelease,
                           RamAccounting ramAccounting) {
    ramAccounting.addBytes(DataTypes.LONG.fixedSize() * maxSamples);
    Reservoir<Long> fetchIdSamples = new Reservoir<>(maxSamples, random);
    ArrayList<DocIdToRow> docIdToRowsFunctionPerReader = new ArrayList<>();
    long totalNumDocs = 0;
    long totalSizeInBytes = 0;
    for (String index : docTable.concreteOpenIndices()) {
        var indexMetadata = metadata.index(index);
        if (indexMetadata == null) {
            continue;
        }
        var indexService = indicesService.indexService(indexMetadata.getIndex());
        if (indexService == null) {
            continue;
        }
        var mapperService = indexService.mapperService();
        FieldTypeLookup fieldTypeLookup = mapperService::fullName;
        var ctx = new DocInputFactory(
            nodeCtx,
            new LuceneReferenceResolver(indexService.index().getName(), fieldTypeLookup, docTable.partitionedByColumns())
        ).getCtx(coordinatorTxnCtx);
        ctx.add(columns);
        List<Input<?>> inputs = ctx.topLevelInputs();
        List<? extends LuceneCollectorExpression<?>> expressions = ctx.expressions();
        CollectorContext collectorContext = new CollectorContext();
        for (LuceneCollectorExpression<?> expression : expressions) {
            expression.startCollect(collectorContext);
        }
        for (IndexShard indexShard : indexService) {
            if (!indexShard.routingEntry().primary()) {
                continue;
            }
            try {
                Engine.Searcher searcher = indexShard.acquireSearcher("update-table-statistics");
                searchersToRelease.add(searcher);
                totalNumDocs += searcher.getIndexReader().numDocs();
                totalSizeInBytes += indexShard.storeStats().getSizeInBytes();
                DocIdToRow docIdToRow = new DocIdToRow(searcher, inputs, expressions);
                docIdToRowsFunctionPerReader.add(docIdToRow);
                try {
                    // The sampling is done in two phases. First collect the docIds;
                    // then retrieve the column values for the sampled docIds.
                    // Two phases are needed because reservoir sampling may overwrite
                    // previously seen items, and we want to avoid unnecessary disk lookups.
                    var collector = new ReservoirCollector(fetchIdSamples, searchersToRelease.size() - 1);
                    searcher.search(new MatchAllDocsQuery(), collector);
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            } catch (IllegalIndexShardStateException | AlreadyClosedException ignored) {
            }
        }
    }
    var rowAccounting = new RowCellsAccountingWithEstimators(Symbols.typeView(columns), ramAccounting, 0);
    ArrayList<Row> records = new ArrayList<>();
    for (long fetchId : fetchIdSamples.samples()) {
        int readerId = FetchId.decodeReaderId(fetchId);
        DocIdToRow docIdToRow = docIdToRowsFunctionPerReader.get(readerId);
        Object[] row = docIdToRow.apply(FetchId.decodeDocId(fetchId));
        try {
            rowAccounting.accountForAndMaybeBreak(row);
        } catch (CircuitBreakingException e) {
            LOGGER.info(
                "Stopped gathering samples for `ANALYZE` operation because circuit breaker triggered. "
                + "Generating statistics with {} instead of {} records",
                records.size(),
                maxSamples
            );
            break;
        }
        records.add(new RowN(row));
    }
    return new Samples(records, streamers, totalNumDocs, totalSizeInBytes);
}
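The in-code comment is the crux of the two-phase design: a reservoir keeps a fixed-size uniform sample over a stream of unknown length, so any incoming fetchId may overwrite one that was sampled earlier, and fetching column values eagerly would waste disk reads. A minimal sketch of the underlying idea (Algorithm R; crate's Reservoir class is assumed to behave along these lines, this is not its actual source):

import java.util.Arrays;
import java.util.Random;

/** Fixed-size uniform sample over a stream of unknown length (Algorithm R). */
final class LongReservoir {
    private final long[] samples;
    private final Random random;
    private int seen = 0;

    LongReservoir(int maxSamples, Random random) {
        this.samples = new long[maxSamples];
        this.random = random;
    }

    void update(long item) {
        if (seen < samples.length) {
            samples[seen] = item;          // reservoir not yet full: always keep
        } else {
            int slot = random.nextInt(seen + 1);
            if (slot < samples.length) {
                samples[slot] = item;      // overwrite a previously sampled item
            }
        }
        seen++;
    }

    long[] samples() {
        return Arrays.copyOf(samples, Math.min(seen, samples.length));
    }
}

After n updates every item has probability maxSamples / n of being in the sample, which is the uniformity ANALYZE needs for unbiased statistics; only the surviving fetchIds are then materialized into rows in the second phase above.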