
Example 1 with DataSource

Use of com.huawei.boostkit.omnidata.model.datasource.DataSource in project boostkit-bigdata by kunpengcompute.

In class ParquetPageSourceFactory, method createParquetPushDownPageSource:

public HivePushDownPageSource createParquetPushDownPageSource(Path path, long start, long length, com.huawei.boostkit.omnidata.model.Predicate predicate) {
    AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext();
    // Point the OmniData client at the push-down server for this split
    Properties transProperties = new Properties();
    transProperties.put(OMNIDATA_CLIENT_TARGET_LIST, omniDataServerTarget);
    // Describe the Parquet split and the predicate to push down, then build the remote reader
    DataSource parquetPushDownDataSource = new com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsParquetDataSource(path.toString(), start, length, false);
    TaskSource readTaskInfo = new TaskSource(parquetPushDownDataSource, predicate, TaskSource.ONE_MEGABYTES);
    DataReader<Page> dataReader = DataReaderFactory.create(transProperties, readTaskInfo, new OpenLooKengDeserializer());
    return new HivePushDownPageSource(dataReader, systemMemoryUsage);
}
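
The factory returns the page source without reading any data yet; the caller drains it page by page. A minimal consumption sketch, assuming the push-down page source is used through openLooKeng's generic ConnectorPageSource contract (the helper below is illustrative, not project code):

// Hypothetical helper: drain a ConnectorPageSource and count the rows it produced.
static long countRows(ConnectorPageSource pageSource) throws IOException {
    long rows = 0;
    try {
        while (!pageSource.isFinished()) {
            Page page = pageSource.getNextPage(); // may return null while data is still in flight
            if (page != null) {
                rows += page.getPositionCount();
            }
        }
    } finally {
        pageSource.close();
    }
    return rows;
}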

Example 2 with DataSource

Use of com.huawei.boostkit.omnidata.model.datasource.DataSource in project boostkit-bigdata by kunpengcompute.

In class ParquetPageSourceFactory, method createParquetPageSource:

public static ParquetPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, long fileSize, Properties schema, List<HiveColumnHandle> columns, boolean useParquetColumnNames, boolean failOnCorruptedParquetStatistics, DataSize maxReadBlockSize, TypeManager typeManager, TupleDomain<HiveColumnHandle> effectivePredicate, FileFormatDataSourceStats stats, DateTimeZone timeZone) {
    AggregatedMemoryContext systemMemoryContext = newSimpleAggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    DateTimeZone readerTimeZone = timeZone;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(user, () -> fileSystem.open(path));
        // Read the Parquet footer to obtain the file schema and row-group metadata
        ParquetMetadata parquetMetadata = MetadataReader.readFooter(inputStream, path, fileSize);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        dataSource = buildHdfsParquetDataSource(inputStream, path, fileSize, stats);
        // Prefer the writer's time zone recorded in the footer over the session time zone
        String writerTimeZoneId = fileMetaData.getKeyValueMetaData().get(WRITER_TIME_ZONE_KEY);
        if (writerTimeZoneId != null && !writerTimeZoneId.equalsIgnoreCase(readerTimeZone.getID())) {
            readerTimeZone = DateTimeZone.forID(writerTimeZoneId);
        }
        List<org.apache.parquet.schema.Type> fields = columns.stream().filter(column -> column.getColumnType() == REGULAR).map(column -> getParquetType(column, fileSchema, useParquetColumnNames)).filter(Objects::nonNull).collect(toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
        // First pass: keep only row groups whose first data page offset falls inside this split
        ImmutableList.Builder<BlockMetaData> footerBlocks = ImmutableList.builder();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                footerBlocks.add(block);
            }
        }
        Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
        TupleDomain<ColumnDescriptor> parquetTupleDomain = getParquetTupleDomain(descriptorsByPath, effectivePredicate);
        Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath);
        final ParquetDataSource finalDataSource = dataSource;
        // Second pass: drop row groups whose column statistics cannot satisfy the predicate
        ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
        for (BlockMetaData block : footerBlocks.build()) {
            if (predicateMatches(parquetPredicate, block, finalDataSource, descriptorsByPath, parquetTupleDomain, failOnCorruptedParquetStatistics)) {
                blocks.add(block);
            }
        }
        MessageColumnIO messageColumnIO = getColumnIO(fileSchema, requestedSchema);
        ParquetReader parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumnIO, blocks.build(), dataSource, readerTimeZone, systemMemoryContext, maxReadBlockSize);
        return new ParquetPageSource(parquetReader, fileSchema, messageColumnIO, typeManager, schema, columns, effectivePredicate, useParquetColumnNames);
    } catch (Exception e) {
        try {
            if (dataSource != null) {
                dataSource.close();
            }
        } catch (IOException ignored) {
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        if (e instanceof ParquetCorruptionException) {
            throw new PrestoException(HIVE_BAD_DATA, e);
        }
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        if (e instanceof BlockMissingException) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
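
The row-group selection above relies only on byte offsets: a row group belongs to this split exactly when the first data page of its first column starts inside [start, start + length). As long as the splits partition the file without gaps or overlaps, every row group is read by exactly one split. A standalone restatement of that rule (illustrative names, plain Java):

// Illustrative only: mirrors the first-pass row-group filter in createParquetPageSource.
static boolean rowGroupBelongsToSplit(long firstDataPageOffset, long splitStart, long splitLength) {
    return firstDataPageOffset >= splitStart && firstDataPageOffset < splitStart + splitLength;
}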

Example 3 with DataSource

Use of com.huawei.boostkit.omnidata.model.datasource.DataSource in project boostkit-bigdata by kunpengcompute.

In class OrcPageSourceFactory, method createOrcPushDownPageSource:

public HivePushDownPageSource createOrcPushDownPageSource(Path path, long start, long length, Predicate predicate) {
    AggregatedMemoryContext systemMemoryUsage = newSimpleAggregatedMemoryContext();
    Properties transProperties = new Properties();
    transProperties.put(OMNIDATA_CLIENT_TARGET_LIST, omniDataServerTarget);
    DataSource orcPushDownDataSource = new com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsOrcDataSource(path.toString(), start, length, false);
    TaskSource readTaskInfo = new TaskSource(orcPushDownDataSource, predicate, TaskSource.ONE_MEGABYTES);
    DataReader<Page> dataReader = DataReaderFactory.create(transProperties, readTaskInfo, new OpenLooKengDeserializer());
    return new HivePushDownPageSource(dataReader, systemMemoryUsage);
}
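
Both push-down factories hand the actual network interaction to an OmniData DataReader. Below is a sketch of driving such a reader directly, mirroring the DataReaderImpl calls in Example 4; the getNextPageBlocking/isFinished/close calls and the property key are assumed from that example, and the target address is hypothetical:

import com.huawei.boostkit.omnidata.decode.impl.OpenLooKengDeserializer;
import com.huawei.boostkit.omnidata.model.TaskSource;
import com.huawei.boostkit.omnidata.reader.DataReader;
import com.huawei.boostkit.omnidata.reader.DataReaderFactory;
import io.prestosql.spi.Page;
import java.util.Properties;

// Sketch only: the reader calls below mirror Example 4 and are assumed, not taken from DataReader's javadoc.
static void drainReader(TaskSource taskSource, String omniDataTarget) {
    Properties properties = new Properties();
    // presumably the literal behind the OMNIDATA_CLIENT_TARGET_LIST constant used above
    properties.put("omnidata.client.target.list", omniDataTarget);
    DataReader<Page> dataReader = DataReaderFactory.create(properties, taskSource, new OpenLooKengDeserializer());
    try {
        while (true) {
            Object batch = dataReader.getNextPageBlocking(); // blocks until the server returns a batch
            // ... consume the batch; its concrete type depends on the configured deserializer ...
            if (dataReader.isFinished()) {
                break;
            }
        }
    } finally {
        dataReader.close();
    }
}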

Example 4 with DataSource

Use of com.huawei.boostkit.omnidata.model.datasource.DataSource in project boostkit-bigdata by kunpengcompute.

In class DataIoAdapter, method getPageIterator:

/**
 * Contacts the OmniData server and fetches the result pages for one file split.
 *
 * @param pageCandidate file split info
 * @param sparkOutPut data schema
 * @param partitionColumn partition columns
 * @param filterOutPut filter schema
 * @param pushDownOperators push-down expressions
 * @return iterator over WritableColumnVector[] pages holding the result data
 * @throws TaskExecutionException if no OmniData server can be connected to
 * @throws UnknownHostException if an OmniData host name cannot be resolved
 * @notice third-party APIs throw the raw Exception type, so this method has to catch the base Exception
 */
public Iterator<WritableColumnVector[]> getPageIterator(PageCandidate pageCandidate, Seq<Attribute> sparkOutPut, Seq<Attribute> partitionColumn, Seq<Attribute> filterOutPut, PushDownInfo pushDownOperators) throws TaskExecutionException, UnknownHostException {
    // Initialize column and filter candidates for this split
    initCandidates(pageCandidate, filterOutPut);
    // Collect the partition column names
    List<Attribute> partitionColumnBatch = JavaConverters.seqAsJavaList(partitionColumn);
    for (Attribute attribute : partitionColumnBatch) {
        partitionColumnName.add(attribute.name());
    }
    // Aggregation push-down candidates; the column info is taken from sparkOutPut only when nothing is aggregated
    List<AggExeInfo> aggExecutionList = JavaConverters.seqAsJavaList(pushDownOperators.aggExecutions());
    if (aggExecutionList.isEmpty()) {
        initColumnInfo(sparkOutPut);
    }
    DataSource dataSource = initDataSource(pageCandidate);
    RowExpression rowExpression = initFilter(pushDownOperators.filterExecutions());
    Optional<RowExpression> prestoFilter = rowExpression == null ? Optional.empty() : Optional.of(rowExpression);
    Optional<AggregationInfo> aggregations = initAggAndGroupInfo(aggExecutionList);
    // Build the push-down predicate from the filter, projections, aggregations and optional limit
    OptionalLong limitLong = NdpUtils.convertLimitExeInfo(pushDownOperators.limitExecution());
    Predicate predicate = new Predicate(omnidataTypes, omnidataColumns, prestoFilter, omnidataProjections, ImmutableMap.of(), ImmutableMap.of(), aggregations, limitLong);
    TaskSource taskSource = new TaskSource(dataSource, predicate, 1048576);
    SparkDeserializer deserializer = initSparkDeserializer();
    WritableColumnVector[] page = null;
    int failedTimes = 0;
    // Try one randomly chosen OmniData host first; on failure, fall back to the remaining hosts
    String[] sdiHostArray = pageCandidate.getSdiHosts().split(",");
    int randomIndex = (int) (Math.random() * sdiHostArray.length);
    Iterator<String> sdiHosts = Arrays.stream(sdiHostArray).iterator();
    Set<String> sdiHostSet = new HashSet<>();
    sdiHostSet.add(sdiHostArray[randomIndex]);
    while (sdiHosts.hasNext()) {
        String sdiHost;
        if (failedTimes == 0) {
            sdiHost = sdiHostArray[randomIndex];
        } else {
            sdiHost = sdiHosts.next();
            if (sdiHostSet.contains(sdiHost)) {
                continue;
            }
        }
        String ipAddress = InetAddress.getByName(sdiHost).getHostAddress();
        Properties properties = new Properties();
        properties.put("omnidata.client.target.list", ipAddress);
        LOG.info("Push down node info: [hostname :{} ,ip :{}]", sdiHost, ipAddress);
        try {
            orcDataReader = new DataReaderImpl<SparkDeserializer>(properties, taskSource, deserializer);
            hasNextPage = true;
            page = (WritableColumnVector[]) orcDataReader.getNextPageBlocking();
            if (orcDataReader.isFinished()) {
                orcDataReader.close();
                hasNextPage = false;
            }
            break;
        } catch (OmniDataException omniDataException) {
            OmniErrorCode errorCode = omniDataException.getErrorCode();
            switch(errorCode) {
                case OMNIDATA_INSUFFICIENT_RESOURCES:
                    LOG.warn("OMNIDATA_INSUFFICIENT_RESOURCES: " + "OmniData-server's push down queue is full, " + "begin to find next OmniData-server");
                    break;
                case OMNIDATA_UNSUPPORTED_OPERATOR:
                    LOG.warn("OMNIDATA_UNSUPPORTED_OPERATOR: " + "OmniDataException: exist unsupported operator");
                    break;
                case OMNIDATA_GENERIC_ERROR:
                    LOG.warn("OMNIDATA_GENERIC_ERROR: Current OmniData-server unavailable, " + "begin to find next OmniData-server");
                    break;
                case OMNIDATA_NOT_FOUND:
                    LOG.warn("OMNIDATA_NOT_FOUND: Current OmniData-Server not found, " + "begin to find next OmniData-server");
                    break;
                case OMNIDATA_INVALID_ARGUMENT:
                    LOG.warn("OMNIDATA_INVALID_ARGUMENT: INVALID_ARGUMENT, " + "exist unsupported operator or dataType");
                    break;
                case OMNIDATA_IO_ERROR:
                    LOG.warn("OMNIDATA_IO_ERROR: Current OmniData-Server io exception, " + "begin to find next OmniData-server");
                    break;
                default:
                    LOG.warn("OmniDataException: OMNIDATA_ERROR.");
            }
            LOG.warn("Push down failed node info [hostname :{} ,ip :{}]", sdiHost, ipAddress);
            ++failedTimes;
        } catch (Exception e) {
            LOG.warn("Push down failed node info [hostname :{} ,ip :{}]", sdiHost, ipAddress, e);
            ++failedTimes;
        }
    }
    int retryTime = Math.min(TASK_FAILED_TIMES, sdiHostArray.length);
    if (failedTimes >= retryTime) {
        LOG.warn("No Omni-data-server to Connect, Task has tried {} times.", retryTime);
        throw new TaskExecutionException("No Omni-data-server to Connect");
    }
    List<WritableColumnVector[]> pages = new ArrayList<>();
    pages.add(page);
    return pages.iterator();
}
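
The host selection above deliberately visits one randomly chosen OmniData host first, which spreads the initial load across the servers, and only then walks the remaining hosts, skipping the one already tried; the number of attempts is bounded by min(TASK_FAILED_TIMES, number of hosts). The same visiting order can be written as a standalone helper (illustrative, not project code):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;

// Illustrative: returns the candidate hosts in the order the retry loop above visits them,
// i.e. one randomly chosen host first, then the rest in their original order.
static List<String> orderHostsForRetry(String[] hosts) {
    int randomIndex = ThreadLocalRandom.current().nextInt(hosts.length);
    List<String> ordered = new ArrayList<>();
    ordered.add(hosts[randomIndex]);
    for (int i = 0; i < hosts.length; i++) {
        if (i != randomIndex) {
            ordered.add(hosts[i]);
        }
    }
    return ordered;
}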

Example 5 with DataSource

Use of com.huawei.boostkit.omnidata.model.datasource.DataSource in project boostkit-bigdata by kunpengcompute.

In class DataIoAdapter, method initDataSource:

private DataSource initDataSource(PageCandidate pageCandidate) throws UnsupportedOperationException {
    DataSource dataSource;
    String fileFormat = pageCandidate.getFileFormat();
    Long fileStartPos = pageCandidate.getStartPos();
    Long fileLen = pageCandidate.getSplitLen();
    if ("ORC".equalsIgnoreCase(fileFormat)) {
        dataSource = new HdfsOrcDataSource(filePath, fileStartPos, fileLen, false);
    } else if ("PARQUET".equalsIgnoreCase(fileFormat)) {
        dataSource = new HdfsParquetDataSource(filePath, fileStartPos, fileLen, false);
    } else {
        throw new UnsupportedOperationException("unsupported data format : " + fileFormat);
    }
    return dataSource;
}
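
Both branches use the same constructor shape seen in Examples 1 and 3: a file path plus the split's start offset and length, with the trailing boolean passed as false at every call site shown here. A minimal construction sketch with hypothetical paths and sizes:

// Hypothetical values, only to show the (String path, long start, long length, boolean) shape.
DataSource orcSource = new HdfsOrcDataSource("hdfs://ns1/warehouse/t1/000000_0", 0L, 134217728L, false);
DataSource parquetSource = new HdfsParquetDataSource("hdfs://ns1/warehouse/t2/part-00000.parquet", 0L, 67108864L, false);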

Aggregations

DataSource (com.huawei.boostkit.omnidata.model.datasource.DataSource): 5 usages
TaskSource (com.huawei.boostkit.omnidata.model.TaskSource): 4 usages
Properties (java.util.Properties): 4 usages
OpenLooKengDeserializer (com.huawei.boostkit.omnidata.decode.impl.OpenLooKengDeserializer): 3 usages
AggregatedMemoryContext (io.prestosql.memory.context.AggregatedMemoryContext): 3 usages
AggregatedMemoryContext.newSimpleAggregatedMemoryContext (io.prestosql.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext): 3 usages
HivePushDownPageSource (io.prestosql.plugin.hive.HivePushDownPageSource): 3 usages
HiveSessionProperties (io.prestosql.plugin.hive.HiveSessionProperties): 3 usages
Page (io.prestosql.spi.Page): 3 usages
HdfsOrcDataSource (com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsOrcDataSource): 2 usages
HdfsParquetDataSource (com.huawei.boostkit.omnidata.model.datasource.hdfs.HdfsParquetDataSource): 2 usages
ParquetDataSource (io.prestosql.parquet.ParquetDataSource): 2 usages
HdfsParquetDataSource.buildHdfsParquetDataSource (io.prestosql.plugin.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource): 2 usages
OptionalLong (java.util.OptionalLong): 2 usages
Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument): 1 usage
Strings.nullToEmpty (com.google.common.base.Strings.nullToEmpty): 1 usage
ImmutableList (com.google.common.collect.ImmutableList): 1 usage
ImmutableMap (com.google.common.collect.ImmutableMap): 1 usage
ImmutableSet (com.google.common.collect.ImmutableSet): 1 usage
SparkDeserializer (com.huawei.boostkit.omnidata.decode.impl.SparkDeserializer): 1 usage