Search in sources :

Example 1 with HdfsRcFileDataSource

Use of io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource in the project hetu-core by openlookeng.

From the class RcFileFileWriterFactory, the method createFileWriter:

/**
 * Creates an RCFile writer for the given target path, or {@link Optional#empty()} if this
 * factory does not handle the requested storage format / SerDe combination.
 *
 * @param path target file path on the (HDFS) file system
 * @param inputColumnNames column names in the order the writer will provide them
 * @param storageFormat Hive storage format descriptor; must name RCFileOutputFormat to be handled here
 * @param schema table/partition properties describing the file's column names and types
 * @param configuration job configuration (consulted for the compression codec)
 * @param session connector session (user identity, query id, session properties)
 * @param acidOptions ACID output options (unused by the RCFile writer)
 * @param acidWriteType ACID write type (unused by the RCFile writer)
 * @return a writer wrapped in {@code Optional}, or empty if another factory should handle the format
 * @throws PrestoException with {@code HIVE_WRITER_OPEN_ERROR} if the file cannot be created
 */
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<AcidOutputFormat.Options> acidOptions, Optional<HiveACIDWriteType> acidWriteType) {
    // This factory only handles RCFile output; other formats are claimed by other factories.
    if (!RCFileOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    RcFileEncoding rcFileEncoding;
    if (LazyBinaryColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = new BinaryRcFileEncoding(timeZone);
    } else if (ColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = RcFilePageSourceFactory.createTextVectorEncoding(schema);
    } else {
        // Unsupported SerDe for RCFile; defer to another writer factory.
        return Optional.empty();
    }
    Optional<String> codecName = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC));
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager)).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        OutputStream outputStream = fileSystem.create(path);
        try {
            Optional<Supplier<RcFileDataSource>> validationInputFactory = Optional.empty();
            if (HiveSessionProperties.isRcfileOptimizedWriterValidate(session)) {
                // Lazily re-open the written file so the writer can validate what was flushed.
                validationInputFactory = Optional.of(() -> {
                    try {
                        return new HdfsRcFileDataSource(path.toString(), fileSystem.open(path), fileSystem.getFileStatus(path).getLen(), stats);
                    } catch (IOException e) {
                        throw new PrestoException(HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED, e);
                    }
                });
            }
            // Best-effort cleanup invoked if the transaction is rolled back.
            Callable<Void> rollbackAction = () -> {
                fileSystem.delete(path, false);
                return null;
            };
            return Optional.of(new RcFileFileWriter(outputStream, rollbackAction, rcFileEncoding, fileColumnTypes, codecName, fileInputColumnIndexes, ImmutableMap.<String, String>builder().put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString()).put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId()).build(), validationInputFactory));
        } catch (Exception e) {
            // Fix: the stream opened by fileSystem.create(path) was previously leaked when the
            // RcFileFileWriter constructor (or anything after create) threw. Close it before
            // rethrowing, keeping any close failure as a suppressed exception.
            try {
                outputStream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    } catch (Exception e) {
        throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating RCFile file", e);
    }
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) StorageFormat(io.prestosql.plugin.hive.metastore.StorageFormat) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) HiveUtil.getColumnTypes(io.prestosql.plugin.hive.HiveUtil.getColumnTypes) Callable(java.util.concurrent.Callable) RcFilePageSourceFactory(io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) HiveUtil.getColumnNames(io.prestosql.plugin.hive.HiveUtil.getColumnNames) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) Type(io.prestosql.spi.type.Type) OutputStream(java.io.OutputStream) PrestoException(io.prestosql.spi.PrestoException) RcFileEncoding(io.prestosql.rcfile.RcFileEncoding) Properties(java.util.Properties) ImmutableMap(com.google.common.collect.ImmutableMap) RCFileOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat) TypeManager(io.prestosql.spi.type.TypeManager) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) IOException(java.io.IOException) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding) ColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe) JobConf(org.apache.hadoop.mapred.JobConf) FileOutputFormat(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HdfsRcFileDataSource(io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource) Optional(java.util.Optional) RcFileDataSource(io.prestosql.rcfile.RcFileDataSource) OutputStream(java.io.OutputStream) PrestoException(io.prestosql.spi.PrestoException) RcFileEncoding(io.prestosql.rcfile.RcFileEncoding) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding) IOException(java.io.IOException) PrestoException(io.prestosql.spi.PrestoException) 
IOException(java.io.IOException) Type(io.prestosql.spi.type.Type) HdfsRcFileDataSource(io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) Supplier(java.util.function.Supplier) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding)

Example 2 with HdfsRcFileDataSource

Use of io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource in the project boostkit-bigdata by kunpengcompute.

From the class RcFileFileWriterFactory, the method createFileWriter:

/**
 * Creates an RCFile writer for the given target path, or {@link Optional#empty()} if this
 * factory does not handle the requested storage format / SerDe combination.
 *
 * @param path target file path on the (HDFS) file system
 * @param inputColumnNames column names in the order the writer will provide them
 * @param storageFormat Hive storage format descriptor; must name RCFileOutputFormat to be handled here
 * @param schema table/partition properties describing the file's column names and types
 * @param configuration job configuration (consulted for the compression codec)
 * @param session connector session (user identity, query id, session properties)
 * @param acidOptions ACID output options (unused by the RCFile writer)
 * @param acidWriteType ACID write type (unused by the RCFile writer)
 * @return a writer wrapped in {@code Optional}, or empty if another factory should handle the format
 * @throws PrestoException with {@code HIVE_WRITER_OPEN_ERROR} if the file cannot be created
 */
@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<AcidOutputFormat.Options> acidOptions, Optional<HiveACIDWriteType> acidWriteType) {
    // This factory only handles RCFile output; other formats are claimed by other factories.
    if (!RCFileOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    RcFileEncoding rcFileEncoding;
    if (LazyBinaryColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = new BinaryRcFileEncoding(timeZone);
    } else if (ColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = RcFilePageSourceFactory.createTextVectorEncoding(schema);
    } else {
        // Unsupported SerDe for RCFile; defer to another writer factory.
        return Optional.empty();
    }
    Optional<String> codecName = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC));
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager)).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        OutputStream outputStream = fileSystem.create(path);
        try {
            Optional<Supplier<RcFileDataSource>> validationInputFactory = Optional.empty();
            if (HiveSessionProperties.isRcfileOptimizedWriterValidate(session)) {
                // Lazily re-open the written file so the writer can validate what was flushed.
                validationInputFactory = Optional.of(() -> {
                    try {
                        return new HdfsRcFileDataSource(path.toString(), fileSystem.open(path), fileSystem.getFileStatus(path).getLen(), stats);
                    } catch (IOException e) {
                        throw new PrestoException(HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED, e);
                    }
                });
            }
            // Best-effort cleanup invoked if the transaction is rolled back.
            Callable<Void> rollbackAction = () -> {
                fileSystem.delete(path, false);
                return null;
            };
            return Optional.of(new RcFileFileWriter(outputStream, rollbackAction, rcFileEncoding, fileColumnTypes, codecName, fileInputColumnIndexes, ImmutableMap.<String, String>builder().put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString()).put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId()).build(), validationInputFactory));
        } catch (Exception e) {
            // Fix: the stream opened by fileSystem.create(path) was previously leaked when the
            // RcFileFileWriter constructor (or anything after create) threw. Close it before
            // rethrowing, keeping any close failure as a suppressed exception.
            try {
                outputStream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    } catch (Exception e) {
        throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating RCFile file", e);
    }
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) StorageFormat(io.prestosql.plugin.hive.metastore.StorageFormat) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) HiveUtil.getColumnTypes(io.prestosql.plugin.hive.HiveUtil.getColumnTypes) Callable(java.util.concurrent.Callable) RcFilePageSourceFactory(io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) HiveUtil.getColumnNames(io.prestosql.plugin.hive.HiveUtil.getColumnNames) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) Type(io.prestosql.spi.type.Type) OutputStream(java.io.OutputStream) PrestoException(io.prestosql.spi.PrestoException) RcFileEncoding(io.prestosql.rcfile.RcFileEncoding) Properties(java.util.Properties) ImmutableMap(com.google.common.collect.ImmutableMap) RCFileOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat) TypeManager(io.prestosql.spi.type.TypeManager) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) IOException(java.io.IOException) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding) ColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe) JobConf(org.apache.hadoop.mapred.JobConf) FileOutputFormat(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HdfsRcFileDataSource(io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource) Optional(java.util.Optional) RcFileDataSource(io.prestosql.rcfile.RcFileDataSource) OutputStream(java.io.OutputStream) PrestoException(io.prestosql.spi.PrestoException) RcFileEncoding(io.prestosql.rcfile.RcFileEncoding) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding) IOException(java.io.IOException) PrestoException(io.prestosql.spi.PrestoException) 
IOException(java.io.IOException) Type(io.prestosql.spi.type.Type) HdfsRcFileDataSource(io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) Supplier(java.util.function.Supplier) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding)

Aggregations

ImmutableMap (com.google.common.collect.ImmutableMap)2 HiveUtil.getColumnNames (io.prestosql.plugin.hive.HiveUtil.getColumnNames)2 HiveUtil.getColumnTypes (io.prestosql.plugin.hive.HiveUtil.getColumnTypes)2 StorageFormat (io.prestosql.plugin.hive.metastore.StorageFormat)2 HdfsRcFileDataSource (io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource)2 RcFilePageSourceFactory (io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory)2 RcFileDataSource (io.prestosql.rcfile.RcFileDataSource)2 RcFileEncoding (io.prestosql.rcfile.RcFileEncoding)2 BinaryRcFileEncoding (io.prestosql.rcfile.binary.BinaryRcFileEncoding)2 PrestoException (io.prestosql.spi.PrestoException)2 ConnectorSession (io.prestosql.spi.connector.ConnectorSession)2 Type (io.prestosql.spi.type.Type)2 TypeManager (io.prestosql.spi.type.TypeManager)2 IOException (java.io.IOException)2 OutputStream (java.io.OutputStream)2 List (java.util.List)2 Objects.requireNonNull (java.util.Objects.requireNonNull)2 Optional (java.util.Optional)2 Properties (java.util.Properties)2 Callable (java.util.concurrent.Callable)2