Use of org.apache.drill.exec.work.filter.BloomFilterDef in project Drill by Apache.
Class RuntimeFilterVisitor, method generateRuntimeFilter.
/**
 * Tries to build a {@link RuntimeFilterDef} for the given hash join. The result is only
 * partially populated: the bloom filters' "local" flag and the definition's
 * sendToForeman flag are created with placeholder values and are expected to be set later.
 *
 * @param hashJoinPrel the hash join to derive a runtime filter from
 * @return {@code null} when no runtime filter applies, otherwise a partially filled
 *         RuntimeFilterDef
 */
private RuntimeFilterDef generateRuntimeFilter(HashJoinPrel hashJoinPrel) {
  JoinRelType joinRelType = hashJoinPrel.getJoinType();
  JoinInfo joinInfo = hashJoinPrel.analyzeCondition();
  // Only equi-joins of type INNER or RIGHT are candidates.
  if (!joinInfo.isEqui() || (joinRelType != JoinRelType.INNER && joinRelType != JoinRelType.RIGHT)) {
    return null;
  }
  // TODO check whether to enable RuntimeFilter according to the NDV percent, e.g.:
  //   double threshold = 0.5;
  //   double percent = leftNDV / rightNDV;
  //   if (percent > threshold) { return null; }

  RelNode left = hashJoinPrel.getLeft();
  RelNode right = hashJoinPrel.getRight();
  // The build side must sit below an exchange
  // (can only be BroadcastExchangePrel or HashToRandomExchangePrel).
  if (findRightExchangePrel(right) == null) {
    return null;
  }

  List<String> leftFields = left.getRowType().getFieldNames();
  List<String> rightFields = right.getRowType().getFieldNames();
  List<Integer> leftKeys = hashJoinPrel.getLeftKeys();
  List<Integer> rightKeys = hashJoinPrel.getRightKeys();
  RelMetadataQuery metadataQuery = left.getCluster().getMetadataQuery();

  List<BloomFilterDef> bloomFilterDefs = new ArrayList<>();
  // Last probe-side scan for which a bloom filter definition was produced.
  ScanPrel probeSideScanPrel = null;

  for (int keyPos = 0; keyPos < leftKeys.size(); keyPos++) {
    String probeFieldName = leftFields.get(leftKeys.get(keyPos));
    String buildFieldName = rightFields.get(rightKeys.get(keyPos));

    // Locate the scan that supplies the probe-side key; this also skips join keys
    // on the left side that are produced by a function call.
    ScanPrel candidateScan = findLeftScanPrel(probeFieldName, left);
    if (candidateScan == null) {
      continue;
    }
    // Skip this key when a blocking node is found between the join's left input and the scan.
    if (containBlockNode((Prel) left, candidateScan)) {
      continue;
    }

    // Collect the NDV of the key column from the metadata.
    RelDataTypeField keyField = candidateScan.getRowType().getField(probeFieldName, true, true);
    int keyColumnIndex = keyField.getIndex();
    Double ndv = metadataQuery.getDistinctRowCount(candidateScan, ImmutableBitSet.of(keyColumnIndex), null);
    if (ndv == null) {
      // No NDV available: estimate it as 10% of the left input's row count.
      ndv = left.estimateRowCount(metadataQuery) * 0.1;
    }

    // Size the bloom filter from the NDV, capped at the configured maximum.
    int bloomFilterSizeInBytes = BloomFilter.optimalNumOfBytes(ndv.longValue(), fpp);
    if (bloomFilterSizeInBytes > bloomFilterMaxSizeInBytesDef) {
      bloomFilterSizeInBytes = bloomFilterMaxSizeInBytesDef;
    }

    // The "local" parameter is passed as false here and is left to be set later.
    BloomFilterDef bloomFilterDef = new BloomFilterDef(bloomFilterSizeInBytes, false, probeFieldName, buildFieldName);
    bloomFilterDef.setLeftNDV(ndv);
    bloomFilterDefs.add(bloomFilterDef);
    toAddRuntimeFilter.add(candidateScan);
    probeSideScanPrel = candidateScan;
  }

  if (bloomFilterDefs.isEmpty()) {
    return null;
  }
  // The sendToForeman parameter is left to be set later.
  RuntimeFilterDef runtimeFilterDef = new RuntimeFilterDef(true, false, bloomFilterDefs, false, -1);
  probeSideScan2hj.put(probeSideScanPrel, hashJoinPrel);
  return runtimeFilterDef;
}
Use of org.apache.drill.exec.work.filter.BloomFilterDef in project Drill by Apache.
Class HashJoinBatch, method setupHash64.
/**
 * Prepares the state the runtime filter needs on the build side: the {@code hash64}
 * function over the build-side join keys, and the mapping from every
 * {@link BloomFilterDef} to the build-side field id it reads.
 *
 * <p>The runtime filter is disabled ({@code enableRuntimeFilter = false}) and the method
 * returns early, leaving {@code hash64} unset, when a build-side key field or a bloom
 * filter's build field cannot be resolved in {@code buildBatch}, or when the operator
 * carries no {@link RuntimeFilterDef}.
 *
 * @param htConfig hash table configuration supplying the build-side key expressions
 * @throws UserException (internal error) when the hash64 code cannot be constructed
 */
private void setupHash64(HashTableConfig htConfig) {
  LogicalExpression[] keyExprsBuild = new LogicalExpression[htConfig.getKeyExprsBuild().size()];
  ErrorCollector collector = new ErrorCollectorImpl();
  int i = 0;
  // Materialize each build-side key expression against the build batch.
  for (NamedExpression ne : htConfig.getKeyExprsBuild()) {
    LogicalExpression expr = ExpressionTreeMaterializer.materialize(ne.getExpr(), buildBatch, collector, context.getFunctionRegistry());
    collector.reportErrors(logger);
    if (expr == null) {
      continue;
    }
    keyExprsBuild[i] = expr;
    i++;
  }

  // Resolve the field id of every build-side key.
  i = 0;
  boolean missingField = false;
  TypedFieldId[] buildSideTypeFieldIds = new TypedFieldId[keyExprsBuild.length];
  for (NamedExpression ne : htConfig.getKeyExprsBuild()) {
    SchemaPath schemaPath = (SchemaPath) ne.getExpr();
    TypedFieldId typedFieldId = buildBatch.getValueVectorId(schemaPath);
    if (typedFieldId == null) {
      missingField = true;
      break;
    }
    buildSideTypeFieldIds[i] = typedFieldId;
    i++;
  }
  if (missingField) {
    logger.info("As some build side key fields not found, runtime filter was disabled");
    enableRuntimeFilter = false;
    return;
  }

  RuntimeFilterDef runtimeFilterDef = popConfig.getRuntimeFilterDef();
  if (runtimeFilterDef == null) {
    // initializeRuntimeFilter() already treats a missing definition as legitimate;
    // without this guard the dereference below would fail with a NullPointerException.
    logger.info("As no RuntimeFilterDef was supplied, runtime filter was disabled");
    enableRuntimeFilter = false;
    return;
  }

  // Remember, per bloom filter definition, the first field id of its build-side field.
  List<BloomFilterDef> bloomFilterDefs = runtimeFilterDef.getBloomFilterDefs();
  for (BloomFilterDef bloomFilterDef : bloomFilterDefs) {
    String buildField = bloomFilterDef.getBuildField();
    SchemaPath schemaPath = new SchemaPath(new PathSegment.NameSegment(buildField), ExpressionPosition.UNKNOWN);
    TypedFieldId typedFieldId = buildBatch.getValueVectorId(schemaPath);
    if (typedFieldId == null) {
      missingField = true;
      break;
    }
    int fieldId = typedFieldId.getFieldIds()[0];
    bloomFilterDef2buildId.put(bloomFilterDef, fieldId);
  }
  if (missingField) {
    logger.info("As some build side join key fields not found, runtime filter was disabled");
    enableRuntimeFilter = false;
    return;
  }

  ValueVectorHashHelper hashHelper = new ValueVectorHashHelper(buildBatch, context);
  try {
    hash64 = hashHelper.getHash64(keyExprsBuild, buildSideTypeFieldIds);
  } catch (Exception e) {
    throw UserException.internalError(e).message("Failed to construct a field's hash64 dynamic codes").build(logger);
  }
}
Use of org.apache.drill.exec.work.filter.BloomFilterDef in project Drill by Apache.
Class TestHashJoinJPPD, method testBroadcastHashJoin1Cond.
@SuppressWarnings("unchecked")
@Test
public void testBroadcastHashJoin1Cond() {
  // One bloom filter on the single join condition lft = rgt.
  final int bloomFilterBytes = BloomFilter.optimalNumOfBytes(2600, 0.01);
  final List<BloomFilterDef> filterDefs = new ArrayList<>();
  filterDefs.add(new BloomFilterDef(bloomFilterBytes, true, "lft", "rgt"));
  final RuntimeFilterDef filterDef = new RuntimeFilterDef(true, false, filterDefs, false, -1);

  final HashJoinPOP hashJoin = new HashJoinPOP(
      null,
      null,
      Lists.newArrayList(joinCond("lft", "EQUALS", "rgt")),
      JoinRelType.INNER,
      filterDef);

  operatorFixture.getOptionManager().setLocalOption("exec.hashjoin.num_partitions", 4);
  operatorFixture.getOptionManager().setLocalOption("exec.hashjoin.num_rows_in_batch", 64);
  operatorFixture.getOptionManager().setLocalOption("exec.hashjoin.max_batches_in_memory", 8);
  operatorFixture.getOptionManager().setLocalOption("exec.hashjoin.enable.runtime_filter", true);

  // Both inputs start with three rows that share the key 0 (duplicate values).
  final List<String> probeRows = Lists.newArrayList(
      "[{\"lft\": 0, \"a\" : \"a string\"}]",
      "[{\"lft\": 0, \"a\" : \"a different string\"},{\"lft\": 0, \"a\" : \"yet another\"}]");
  final List<String> buildRows = Lists.newArrayList(
      "[{\"rgt\": 0, \"b\" : \"a string\"}]",
      "[{\"rgt\": 0, \"b\" : \"a different string\"},{\"rgt\": 0, \"b\" : \"yet another\"}]");

  // Add 2500 probe rows with keys 1..2500; the build side has no rows with those keys.
  final int extraProbeRows = 2500;
  int key = 1;
  while (key <= extraProbeRows) {
    probeRows.add("[{\"lft\": " + key + ", \"a\" : \"a string\"}]");
    key++;
  }

  // 3 probe rows x 3 build rows with key 0 => 9 joined rows.
  legacyOpTestBuilder()
      .physicalOperator(hashJoin)
      .inputDataStreamsJson(Lists.newArrayList(probeRows, buildRows))
      .baselineColumns("lft", "a", "b", "rgt")
      .expectedTotalRows(9)
      .go();
}
Use of org.apache.drill.exec.work.filter.BloomFilterDef in project Drill by Apache.
Class TestHashJoinJPPD, method testBroadcastHashJoin2Cond.
@SuppressWarnings("unchecked")
@Test
public void testBroadcastHashJoin2Cond() {
  // Two bloom filters, one per join condition: lft = rgt and a = b.
  final int bloomFilterBytes = BloomFilter.optimalNumOfBytes(2600, 0.01);
  final List<BloomFilterDef> filterDefs = new ArrayList<>();
  filterDefs.add(new BloomFilterDef(bloomFilterBytes, true, "lft", "rgt"));
  filterDefs.add(new BloomFilterDef(bloomFilterBytes, true, "a", "b"));
  final RuntimeFilterDef filterDef = new RuntimeFilterDef(true, false, filterDefs, false, -1);

  final HashJoinPOP hashJoin = new HashJoinPOP(
      null,
      null,
      Lists.newArrayList(joinCond("lft", "EQUALS", "rgt"), joinCond("a", "EQUALS", "b")),
      JoinRelType.INNER,
      filterDef);

  operatorFixture.getOptionManager().setLocalOption("exec.hashjoin.num_partitions", 4);
  operatorFixture.getOptionManager().setLocalOption("exec.hashjoin.num_rows_in_batch", 128);
  operatorFixture.getOptionManager().setLocalOption("exec.hashjoin.max_batches_in_memory", 8);
  operatorFixture.getOptionManager().setLocalOption("exec.hashjoin.enable.runtime_filter", true);

  // Both inputs start with three rows that share the key 0 (duplicate values).
  final List<String> probeRows = Lists.newArrayList(
      "[{\"lft\": 0, \"a\" : \"a string\"}]",
      "[{\"lft\": 0, \"a\" : \"a different string\"},{\"lft\": 0, \"a\" : \"yet another\"}]");
  final List<String> buildRows = Lists.newArrayList(
      "[{\"rgt\": 0, \"b\" : \"a string\"}]",
      "[{\"rgt\": 0, \"b\" : \"a different string\"},{\"rgt\": 0, \"b\" : \"yet another\"}]");

  // Add 2500 probe rows with keys 1..2500; the build side has no rows with those keys.
  final int extraProbeRows = 2500;
  int key = 1;
  while (key <= extraProbeRows) {
    probeRows.add("[{\"lft\": " + key + ", \"a\" : \"a string\"}]");
    key++;
  }

  // With key 0 on both sides, each of the three strings pairs with exactly one
  // build row on a = b, giving 3 joined rows.
  legacyOpTestBuilder()
      .physicalOperator(hashJoin)
      .inputDataStreamsJson(Lists.newArrayList(probeRows, buildRows))
      .baselineColumns("lft", "a", "b", "rgt")
      .expectedTotalRows(3)
      .go();
}
Use of org.apache.drill.exec.work.filter.BloomFilterDef in project Drill by Apache.
Class HashJoinBatch, method initializeRuntimeFilter.
/**
 * Creates the runtime filter reporter and, when the operator carries a
 * {@link RuntimeFilterDef}, one {@link BloomFilter} per {@link BloomFilterDef},
 * recording each filter's probe field and build-side field id. Does nothing when the
 * runtime filter is disabled or the bloom filters were already generated.
 *
 * <p>Note: this method can not be called again as part of the recursive call of
 * executeBuildPhase() that handles spilled build partitions.
 */
private void initializeRuntimeFilter() {
  if (bloomFiltersGenerated || !enableRuntimeFilter) {
    return;
  }
  runtimeFilterReporter = new RuntimeFilterReporter((ExecutorFragmentContext) context);
  // The definition is only present when the RuntimeFilterRouter decided to attach one.
  final RuntimeFilterDef filterDef = popConfig.getRuntimeFilterDef();
  if (filterDef != null) {
    for (BloomFilterDef def : filterDef.getBloomFilterDefs()) {
      int buildFieldId = bloomFilterDef2buildId.get(def);
      int sizeInBytes = def.getNumBytes();
      probeFields.add(def.getProbeField());
      BloomFilter filter = new BloomFilter(sizeInBytes, context.getAllocator());
      bloomFilters.add(filter);
      bloomFilter2buildId.put(filter, buildFieldId);
    }
  }
  // Mark as done even without a definition, so later calls return immediately.
  bloomFiltersGenerated = true;
}
Aggregations