Example 1 with AggregationInfo

Use of com.huawei.boostkit.omnidata.model.AggregationInfo in project boostkit-bigdata by kunpengcompute.

From class DataIoAdapter, method createAggregationInfo.

private Optional<AggregationInfo> createAggregationInfo(
        List<AggregateFunction> aggregateFunctions,
        List<NamedExpression> namedExpressions) {
    List<RowExpression> groupingKeys = new ArrayList<>();
    Map<String, AggregationInfo.AggregateFunction> aggregationMap = new LinkedHashMap<>();
    boolean isEmpty = true;
    // translate each grouping expression into a Presto RowExpression grouping key
    for (NamedExpression namedExpression : namedExpressions) {
        RowExpression groupingKey = extractNamedExpression((Expression) namedExpression);
        groupingKeys.add(groupingKey);
        isEmpty = false;
    }
    // translate each Spark aggregate function into an entry of the aggregation map
    for (AggregateFunction aggregateFunction : aggregateFunctions) {
        extractAggregateFunction(aggregateFunction, aggregationMap);
        isEmpty = false;
    }
    return isEmpty ? Optional.empty() : Optional.of(new AggregationInfo(aggregationMap, groupingKeys));
}
Also used: NamedExpression (org.apache.spark.sql.catalyst.expressions.NamedExpression), AggregateFunction (org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction), ArrayList (java.util.ArrayList), RowExpression (io.prestosql.spi.relation.RowExpression), AggregationInfo (com.huawei.boostkit.omnidata.model.AggregationInfo), LinkedHashMap (java.util.LinkedHashMap)
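
Both this call site and the test in Example 4 construct the model the same way: a name-keyed map of AggregationInfo.AggregateFunction entries plus a list of grouping-key RowExpressions. A minimal construction sketch, assuming a CallExpression (countCall below) has been built elsewhere, since creating one requires a resolved FunctionHandle and is out of scope here:

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import com.google.common.collect.ImmutableList;
import com.huawei.boostkit.omnidata.model.AggregationInfo;
import io.prestosql.spi.relation.CallExpression;
import io.prestosql.spi.relation.RowExpression;
import io.prestosql.spi.relation.VariableReferenceExpression;

import static io.prestosql.spi.type.IntegerType.INTEGER;

final class AggregationInfoSketch {
    // countCall is assumed to be a count aggregate built elsewhere (hypothetical)
    static AggregationInfo buildCountGroupByX(CallExpression countCall) {
        // LinkedHashMap preserves extraction order, as in createAggregationInfo above
        Map<String, AggregationInfo.AggregateFunction> aggregations = new LinkedHashMap<>();
        // the boolean marks the aggregate as DISTINCT (false here), as in Example 4
        aggregations.put("count", new AggregationInfo.AggregateFunction(countCall, false));
        // group by an integer column named "x"
        List<RowExpression> groupingKeys =
                ImmutableList.of(new VariableReferenceExpression("x", INTEGER));
        return new AggregationInfo(aggregations, groupingKeys);
    }
}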

Example 2 with AggregationInfo

Use of com.huawei.boostkit.omnidata.model.AggregationInfo in project boostkit-bigdata by kunpengcompute.

From class DataIoAdapter, method extractAggAndGroupExpression.

private Optional<AggregationInfo> extractAggAndGroupExpression(List<AggExeInfo> aggExecutionList) {
    Optional<AggregationInfo> resAggregationInfo = Optional.empty();
    for (AggExeInfo aggExeInfo : aggExecutionList) {
        List<AggregateFunction> aggregateExpressions = JavaConverters.seqAsJavaList(aggExeInfo.aggregateExpressions());
        List<NamedExpression> namedExpressions = JavaConverters.seqAsJavaList(aggExeInfo.groupingExpressions());
        // each iteration replaces the previous result, so only the last AggExeInfo wins
        resAggregationInfo = createAggregationInfo(aggregateExpressions, namedExpressions);
    }
    return resAggregationInfo;
}
Also used: AggExeInfo (org.apache.spark.sql.execution.ndp.AggExeInfo), NamedExpression (org.apache.spark.sql.catalyst.expressions.NamedExpression), AggregateFunction (org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction), AggregationInfo (com.huawei.boostkit.omnidata.model.AggregationInfo)
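
Note that the loop above overwrites resAggregationInfo on every pass, so when aggExecutionList contains more than one AggExeInfo only the last one determines the returned value. A standalone sketch of that last-wins pattern (hypothetical names, not from the project):

import java.util.List;
import java.util.Optional;

final class LastWinsSketch {
    // Mirrors extractAggAndGroupExpression: each iteration replaces the result,
    // so only the final list element survives.
    static Optional<String> extractLast(List<String> items) {
        Optional<String> result = Optional.empty();
        for (String item : items) {
            result = Optional.of("info:" + item);  // previous value is discarded
        }
        return result;
    }

    public static void main(String[] args) {
        System.out.println(extractLast(List.of("a", "b", "c")));  // Optional[info:c]
    }
}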

Example 3 with AggregationInfo

Use of com.huawei.boostkit.omnidata.model.AggregationInfo in project boostkit-bigdata by kunpengcompute.

From class DataIoAdapter, method getPageIterator.

/**
 * Contacts the OmniData server and fetches the result pages for a file split.
 *
 * @param pageCandidate file split info
 * @param sparkOutPut data schema
 * @param partitionColumn partition columns
 * @param filterOutPut filter schema
 * @param pushDownOperators push-down expressions
 * @return iterator over WritableColumnVector[] result pages
 * @throws TaskExecutionException if no OmniData server can be connected to
 * @throws UnknownHostException if a push-down host name cannot be resolved
 * @notice third-party APIs throw raw Exception, so this method has to catch Exception
 */
public Iterator<WritableColumnVector[]> getPageIterator(PageCandidate pageCandidate,
        Seq<Attribute> sparkOutPut, Seq<Attribute> partitionColumn, Seq<Attribute> filterOutPut,
        PushDownInfo pushDownOperators) throws TaskExecutionException, UnknownHostException {
    initCandidates(pageCandidate, filterOutPut);
    // collect partition column names
    List<Attribute> partitionColumnBatch = JavaConverters.seqAsJavaList(partitionColumn);
    for (Attribute attribute : partitionColumnBatch) {
        partitionColumnName.add(attribute.name());
    }
    List<AggExeInfo> aggExecutionList = JavaConverters.seqAsJavaList(pushDownOperators.aggExecutions());
    // no aggregation pushed down: build column info directly from the Spark output schema
    if (aggExecutionList.isEmpty()) {
        initColumnInfo(sparkOutPut);
    }
    DataSource dataSource = initDataSource(pageCandidate);
    RowExpression rowExpression = initFilter(pushDownOperators.filterExecutions());
    Optional<RowExpression> prestoFilter = Optional.ofNullable(rowExpression);
    Optional<AggregationInfo> aggregations = initAggAndGroupInfo(aggExecutionList);
    // create limitLong
    OptionalLong limitLong = NdpUtils.convertLimitExeInfo(pushDownOperators.limitExecution());
    Predicate predicate = new Predicate(omnidataTypes, omnidataColumns, prestoFilter, omnidataProjections, ImmutableMap.of(), ImmutableMap.of(), aggregations, limitLong);
    TaskSource taskSource = new TaskSource(dataSource, predicate, 1048576);
    SparkDeserializer deserializer = initSparkDeserializer();
    WritableColumnVector[] page = null;
    int failedTimes = 0;
    String[] sdiHostArray = pageCandidate.getSdiHosts().split(",");
    // choose a random host for the first attempt and remember it so it is not retried
    int randomIndex = (int) (Math.random() * sdiHostArray.length);
    Iterator<String> sdiHosts = Arrays.stream(sdiHostArray).iterator();
    Set<String> sdiHostSet = new HashSet<>();
    sdiHostSet.add(sdiHostArray[randomIndex]);
    while (sdiHosts.hasNext()) {
        String sdiHost;
        if (failedTimes == 0) {
            // first attempt goes to the randomly chosen host
            sdiHost = sdiHostArray[randomIndex];
        } else {
            // later attempts walk the host list, skipping the host already tried
            sdiHost = sdiHosts.next();
            if (sdiHostSet.contains(sdiHost)) {
                continue;
            }
        }
        String ipAddress = InetAddress.getByName(sdiHost).getHostAddress();
        Properties properties = new Properties();
        properties.put("omnidata.client.target.list", ipAddress);
        LOG.info("Push down node info: [hostname :{} ,ip :{}]", sdiHost, ipAddress);
        try {
            orcDataReader = new DataReaderImpl<SparkDeserializer>(properties, taskSource, deserializer);
            hasNextPage = true;
            page = (WritableColumnVector[]) orcDataReader.getNextPageBlocking();
            if (orcDataReader.isFinished()) {
                orcDataReader.close();
                hasNextPage = false;
            }
            break;
        } catch (OmniDataException omniDataException) {
            OmniErrorCode errorCode = omniDataException.getErrorCode();
            switch(errorCode) {
                case OMNIDATA_INSUFFICIENT_RESOURCES:
                    LOG.warn("OMNIDATA_INSUFFICIENT_RESOURCES: " + "OmniData-server's push down queue is full, " + "begin to find next OmniData-server");
                    break;
                case OMNIDATA_UNSUPPORTED_OPERATOR:
                    LOG.warn("OMNIDATA_UNSUPPORTED_OPERATOR: " + "OmniDataException: exist unsupported operator");
                    break;
                case OMNIDATA_GENERIC_ERROR:
                    LOG.warn("OMNIDATA_GENERIC_ERROR: Current OmniData-server unavailable, " + "begin to find next OmniData-server");
                    break;
                case OMNIDATA_NOT_FOUND:
                    LOG.warn("OMNIDATA_NOT_FOUND: Current OmniData-Server not found, " + "begin to find next OmniData-server");
                    break;
                case OMNIDATA_INVALID_ARGUMENT:
                    LOG.warn("OMNIDATA_INVALID_ARGUMENT: INVALID_ARGUMENT, " + "exist unsupported operator or dataType");
                    break;
                case OMNIDATA_IO_ERROR:
                    LOG.warn("OMNIDATA_IO_ERROR: Current OmniData-Server io exception, " + "begin to find next OmniData-server");
                    break;
                default:
                    LOG.warn("OmniDataException: OMNIDATA_ERROR.");
            }
            LOG.warn("Push down failed node info [hostname :{} ,ip :{}]", sdiHost, ipAddress);
            ++failedTimes;
        } catch (Exception e) {
            LOG.warn("Push down failed node info [hostname :{} ,ip :{}]", sdiHost, ipAddress, e);
            ++failedTimes;
        }
    }
    int retryTime = Math.min(TASK_FAILED_TIMES, sdiHostArray.length);
    if (failedTimes >= retryTime) {
        LOG.warn("No OmniData server to connect to; task has tried {} times.", retryTime);
        throw new TaskExecutionException("No OmniData server to connect to");
    }
    List<WritableColumnVector[]> result = new ArrayList<>();
    result.add(page);
    return result.iterator();
}
Also used: Attribute (org.apache.spark.sql.catalyst.expressions.Attribute), ArrayList (java.util.ArrayList), Properties (java.util.Properties), Predicate (com.huawei.boostkit.omnidata.model.Predicate), OmniDataException (com.huawei.boostkit.omnidata.exception.OmniDataException), OmniErrorCode (com.huawei.boostkit.omnidata.exception.OmniErrorCode), AggregationInfo (com.huawei.boostkit.omnidata.model.AggregationInfo), HashSet (java.util.HashSet), SparkDeserializer (com.huawei.boostkit.omnidata.decode.impl.SparkDeserializer), RowExpression (io.prestosql.spi.relation.RowExpression), TaskExecutionException (org.apache.hadoop.hive.ql.exec.TaskExecutionException), UnknownHostException (java.net.UnknownHostException), HdfsParquetDataSource (com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsParquetDataSource), HdfsOrcDataSource (com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsOrcDataSource), DataSource (com.huawei.boostkit.omnidata.model.datasource.DataSource), AggExeInfo (org.apache.spark.sql.execution.ndp.AggExeInfo), WritableColumnVector (org.apache.spark.sql.execution.vectorized.WritableColumnVector), OptionalLong (java.util.OptionalLong), TaskSource (com.huawei.boostkit.omnidata.model.TaskSource)
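
The connection loop above follows a simple failover strategy: try one randomly chosen host first, then walk the full host list in order, skipping the host already attempted, until a read succeeds or every candidate has failed. A stripped-down sketch of just that selection logic, with a hypothetical tryConnect standing in for the DataReaderImpl construction:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

final class FailoverSketch {
    // Hypothetical stand-in for building a DataReaderImpl against one host.
    static boolean tryConnect(String host) {
        return false;  // pretend every host fails so the loop is fully exercised
    }

    static String connectWithFailover(String hostListCsv) {
        String[] hosts = hostListCsv.split(",");
        int randomIndex = (int) (Math.random() * hosts.length);
        Set<String> tried = new HashSet<>();
        tried.add(hosts[randomIndex]);
        int failed = 0;
        Iterator<String> it = Arrays.stream(hosts).iterator();
        while (it.hasNext()) {
            String host;
            if (failed == 0) {
                host = hosts[randomIndex];  // first attempt: the random pick
            } else {
                host = it.next();
                if (tried.contains(host)) {
                    continue;  // skip the host already tried
                }
            }
            if (tryConnect(host)) {
                return host;
            }
            ++failed;
        }
        return null;  // the caller turns this into a TaskExecutionException
    }

    public static void main(String[] args) {
        System.out.println(connectWithFailover("host-a,host-b,host-c"));  // null: all fail
    }
}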

Example 4 with AggregationInfo

Use of com.huawei.boostkit.omnidata.model.AggregationInfo in project boostkit-bigdata by kunpengcompute.

From class TestHivePartialAggregationPushdown, method testPartialAggregationPushdown.

@Test
public void testPartialAggregationPushdown() {
    // select count(x) from table group by x
    TableScanNode tableScanNode = buildTableScanNode(COLUMN_INT);
    AggregationNode aggregationNode = buildCountAggregationNode(tableScanNode);
    ImmutableMap.Builder<String, AggregationInfo.AggregateFunction> aggregationsExpected = new ImmutableMap.Builder<>();
    for (Map.Entry<Symbol, AggregationNode.Aggregation> entry : aggregationNode.getAggregations().entrySet()) {
        AggregationInfo.AggregateFunction aggregateFunction = new AggregationInfo.AggregateFunction(entry.getValue().getFunctionCall(), entry.getValue().isDistinct());
        aggregationsExpected.put(entry.getKey().getName(), aggregateFunction);
    }
    List<RowExpression> groupingKeysExpected = ImmutableList.of(new VariableReferenceExpression(COLUMN_INT.getName(), INTEGER));
    AggregationInfo aggregationInfoExpected = new AggregationInfo(aggregationsExpected.build(), groupingKeysExpected);
    PlanNode outputNode = AGGREGATION_OPTIMIZER.optimize(aggregationNode, OFFLOAD_SESSION, COLUMN_TYPE_MAP, SYMBOL_ALLOCATOR, ID_ALLOCATOR);
    matchAggregatorOffload(outputNode, aggregationInfoExpected);
}
Also used: Symbol (io.prestosql.spi.plan.Symbol), RowExpression (io.prestosql.spi.relation.RowExpression), TestHivePushdownUtil.buildAggregationNode (io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildAggregationNode), AggregationNode (io.prestosql.spi.plan.AggregationNode), ImmutableMap (com.google.common.collect.ImmutableMap), TestHivePushdownUtil.createAggregation (io.prestosql.plugin.hive.rule.TestHivePushdownUtil.createAggregation), PlanNode (io.prestosql.spi.plan.PlanNode), TableScanNode (io.prestosql.spi.plan.TableScanNode), TestHivePushdownUtil.buildTableScanNode (io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildTableScanNode), VariableReferenceExpression (io.prestosql.spi.relation.VariableReferenceExpression), AggregationInfo (com.huawei.boostkit.omnidata.model.AggregationInfo), Map (java.util.Map), Test (org.testng.annotations.Test)

Example 5 with AggregationInfo

Use of com.huawei.boostkit.omnidata.model.AggregationInfo in project boostkit-bigdata by kunpengcompute.

From class TestHivePushdownUtil, method matchAggregatorOffload.

protected static void matchAggregatorOffload(PlanNode node, AggregationInfo aggregationInfoExpected) {
    HiveOffloadExpression expression = getCheckedOffloadExpression(node);
    assertTrue(expression.isPresent());
    assertTrue(expression.getAggregations().isPresent());
    AggregationInfo aggregationInfo = expression.getAggregations().get();
    assertEquals(aggregationInfoExpected, aggregationInfo);
}
Also used: HiveOffloadExpression (io.prestosql.plugin.hive.HiveOffloadExpression), AggregationInfo (com.huawei.boostkit.omnidata.model.AggregationInfo)

Aggregations

AggregationInfo (com.huawei.boostkit.omnidata.model.AggregationInfo): 6
RowExpression (io.prestosql.spi.relation.RowExpression): 4
ImmutableMap (com.google.common.collect.ImmutableMap): 2
ArrayList (java.util.ArrayList): 2
Map (java.util.Map): 2
NamedExpression (org.apache.spark.sql.catalyst.expressions.NamedExpression): 2
AggregateFunction (org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction): 2
AggExeInfo (org.apache.spark.sql.execution.ndp.AggExeInfo): 2
ImmutableList (com.google.common.collect.ImmutableList): 1
SparkDeserializer (com.huawei.boostkit.omnidata.decode.impl.SparkDeserializer): 1
OmniDataException (com.huawei.boostkit.omnidata.exception.OmniDataException): 1
OmniErrorCode (com.huawei.boostkit.omnidata.exception.OmniErrorCode): 1
Predicate (com.huawei.boostkit.omnidata.model.Predicate): 1
TaskSource (com.huawei.boostkit.omnidata.model.TaskSource): 1
DataSource (com.huawei.boostkit.omnidata.model.datasource.DataSource): 1
HdfsOrcDataSource (com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsOrcDataSource): 1
HdfsParquetDataSource (com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsParquetDataSource): 1
HiveOffloadExpression (io.prestosql.plugin.hive.HiveOffloadExpression): 1
TestHivePushdownUtil.buildAggregationNode (io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildAggregationNode): 1
TestHivePushdownUtil.buildTableScanNode (io.prestosql.plugin.hive.rule.TestHivePushdownUtil.buildTableScanNode): 1