Search in sources:

Example 1 with RcFileEncoding

Use of io.prestosql.rcfile.RcFileEncoding in project hetu-core by openlookeng.

From the class RcFileFileWriterFactory, method createFileWriter:

@Override
public Optional<HiveFileWriter> createFileWriter(Path path, List<String> inputColumnNames, StorageFormat storageFormat, Properties schema, JobConf configuration, ConnectorSession session, Optional<AcidOutputFormat.Options> acidOptions, Optional<HiveACIDWriteType> acidWriteType) {
    // This factory only handles RCFile output; other formats are claimed by other factories.
    if (!RCFileOutputFormat.class.getName().equals(storageFormat.getOutputFormat())) {
        return Optional.empty();
    }
    // Select the column encoding from the declared SerDe; unsupported SerDes are declined.
    RcFileEncoding rcFileEncoding;
    if (LazyBinaryColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = new BinaryRcFileEncoding(timeZone);
    } else if (ColumnarSerDe.class.getName().equals(storageFormat.getSerDe())) {
        rcFileEncoding = RcFilePageSourceFactory.createTextVectorEncoding(schema);
    } else {
        return Optional.empty();
    }
    Optional<String> codecName = Optional.ofNullable(configuration.get(FileOutputFormat.COMPRESS_CODEC));
    // existing tables and partitions may have columns in a different order than the writer is providing, so build
    // an index to rearrange columns in the proper order
    List<String> fileColumnNames = getColumnNames(schema);
    List<Type> fileColumnTypes = getColumnTypes(schema).stream().map(hiveType -> hiveType.getType(typeManager)).collect(toList());
    int[] fileInputColumnIndexes = fileColumnNames.stream().mapToInt(inputColumnNames::indexOf).toArray();
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        OutputStream outputStream = fileSystem.create(path);
        try {
            Optional<Supplier<RcFileDataSource>> validationInputFactory = Optional.empty();
            if (HiveSessionProperties.isRcfileOptimizedWriterValidate(session)) {
                // Lazily re-open the written file so the writer can validate its own output.
                validationInputFactory = Optional.of(() -> {
                    try {
                        return new HdfsRcFileDataSource(path.toString(), fileSystem.open(path), fileSystem.getFileStatus(path).getLen(), stats);
                    } catch (IOException e) {
                        throw new PrestoException(HiveErrorCode.HIVE_WRITE_VALIDATION_FAILED, e);
                    }
                });
            }
            Callable<Void> rollbackAction = () -> {
                fileSystem.delete(path, false);
                return null;
            };
            return Optional.of(new RcFileFileWriter(outputStream, rollbackAction, rcFileEncoding, fileColumnTypes, codecName, fileInputColumnIndexes, ImmutableMap.<String, String>builder().put(HiveMetadata.PRESTO_VERSION_NAME, nodeVersion.toString()).put(HiveMetadata.PRESTO_QUERY_ID_NAME, session.getQueryId()).build(), validationInputFactory));
        } catch (Exception e) {
            // Don't leak the open stream (and its half-written file) when writer construction fails;
            // the close failure, if any, is suppressed because the original exception is more informative.
            try {
                outputStream.close();
            } catch (IOException ignored) {
                // best-effort cleanup only
            }
            throw e;
        }
    } catch (Exception e) {
        throw new PrestoException(HiveErrorCode.HIVE_WRITER_OPEN_ERROR, "Error creating RCFile file", e);
    }
}
Also used : DateTimeZone(org.joda.time.DateTimeZone) StorageFormat(io.prestosql.plugin.hive.metastore.StorageFormat) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) HiveUtil.getColumnTypes(io.prestosql.plugin.hive.HiveUtil.getColumnTypes) Callable(java.util.concurrent.Callable) RcFilePageSourceFactory(io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory) Supplier(java.util.function.Supplier) Inject(javax.inject.Inject) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) HiveUtil.getColumnNames(io.prestosql.plugin.hive.HiveUtil.getColumnNames) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) Type(io.prestosql.spi.type.Type) OutputStream(java.io.OutputStream) PrestoException(io.prestosql.spi.PrestoException) RcFileEncoding(io.prestosql.rcfile.RcFileEncoding) Properties(java.util.Properties) ImmutableMap(com.google.common.collect.ImmutableMap) RCFileOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat) TypeManager(io.prestosql.spi.type.TypeManager) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) IOException(java.io.IOException) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding) ColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe) JobConf(org.apache.hadoop.mapred.JobConf) FileOutputFormat(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HdfsRcFileDataSource(io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource) Optional(java.util.Optional) RcFileDataSource(io.prestosql.rcfile.RcFileDataSource) OutputStream(java.io.OutputStream) PrestoException(io.prestosql.spi.PrestoException) RcFileEncoding(io.prestosql.rcfile.RcFileEncoding) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding) IOException(java.io.IOException) PrestoException(io.prestosql.spi.PrestoException) 
IOException(java.io.IOException) Type(io.prestosql.spi.type.Type) HdfsRcFileDataSource(io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource) FileSystem(org.apache.hadoop.fs.FileSystem) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) Supplier(java.util.function.Supplier) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding)

Example 2 with RcFileEncoding

Use of io.prestosql.rcfile.RcFileEncoding in project hetu-core by openlookeng.

From the class RcFilePageSourceFactory, method createPageSource:

@Override
public Optional<? extends ConnectorPageSource> createPageSource(Configuration configuration, ConnectorSession session, Path path, long start, long length, long fileSize, Properties schema, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, Optional<DynamicFilterSupplier> dynamicFilters, Optional<DeleteDeltaLocations> deleteDeltaLocations, Optional<Long> startRowOffsetOfFile, Optional<List<IndexMetadata>> indexes, SplitMetadata splitMetadata, boolean splitCacheable, long dataSourceLastModifiedTime) {
    // Choose the column encoding from the table's deserializer; any other SerDe is not ours to read.
    String deserializerClassName = getDeserializerClassName(schema);
    RcFileEncoding rcFileEncoding;
    if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
        rcFileEncoding = new BinaryRcFileEncoding(timeZone);
    } else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
        rcFileEncoding = createTextVectorEncoding(schema);
    } else {
        return Optional.empty();
    }
    checkArgument(!deleteDeltaLocations.isPresent(), "Delete delta is not supported");
    if (fileSize == 0) {
        throw new PrestoException(HIVE_BAD_DATA, "RCFile is empty: " + path);
    }
    // Open the split's file as the session user.
    FSDataInputStream inputStream;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getUser(), path, configuration);
        inputStream = hdfsEnvironment.doAs(session.getUser(), () -> fileSystem.open(path));
    } catch (Exception e) {
        // A vanished file or a closed filesystem carries no useful split context; report it bare.
        if (e instanceof FileNotFoundException || nullToEmpty(e.getMessage()).trim().equals("Filesystem closed")) {
            throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }
    try {
        // Map each projected column's file index to its engine type.
        ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
        columns.forEach(column -> readColumns.put(column.getHiveColumnIndex(), column.getHiveType().getType(typeManager)));
        RcFileReader rcFileReader = new RcFileReader(
                new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats),
                rcFileEncoding,
                readColumns.build(),
                new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())),
                start,
                length,
                new DataSize(8, Unit.MEGABYTE));
        return Optional.of(new RcFilePageSource(rcFileReader, columns, typeManager));
    } catch (Throwable e) {
        // Reader construction failed: release the stream, then classify the failure.
        try {
            inputStream.close();
        } catch (IOException ignored) {
            // suppressed — the original failure below is the one worth reporting
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof RcFileCorruptionException) {
            throw new PrestoException(HIVE_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) PrestoException(io.prestosql.spi.PrestoException) RcFileEncoding(io.prestosql.rcfile.RcFileEncoding) TextRcFileEncoding(io.prestosql.rcfile.text.TextRcFileEncoding) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding) IOException(java.io.IOException) RcFileReader(io.prestosql.rcfile.RcFileReader) RcFileCorruptionException(io.prestosql.rcfile.RcFileCorruptionException) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) PrestoException(io.prestosql.spi.PrestoException) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) ImmutableMap(com.google.common.collect.ImmutableMap) Type(io.prestosql.spi.type.Type) RcFileCorruptionException(io.prestosql.rcfile.RcFileCorruptionException) HadoopCodecFactory(io.prestosql.rcfile.HadoopCodecFactory) FileSystem(org.apache.hadoop.fs.FileSystem) DataSize(io.airlift.units.DataSize) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) LazyBinaryColumnarSerDe(org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe) BlockMissingException(org.apache.hadoop.hdfs.BlockMissingException) BinaryRcFileEncoding(io.prestosql.rcfile.binary.BinaryRcFileEncoding) AircompressorCodecFactory(io.prestosql.rcfile.AircompressorCodecFactory) HiveColumnHandle(io.prestosql.plugin.hive.HiveColumnHandle)

Aggregations

ImmutableMap (com.google.common.collect.ImmutableMap)2 RcFileEncoding (io.prestosql.rcfile.RcFileEncoding)2 BinaryRcFileEncoding (io.prestosql.rcfile.binary.BinaryRcFileEncoding)2 PrestoException (io.prestosql.spi.PrestoException)2 Type (io.prestosql.spi.type.Type)2 IOException (java.io.IOException)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 LazyBinaryColumnarSerDe (org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe)2 DataSize (io.airlift.units.DataSize)1 HiveColumnHandle (io.prestosql.plugin.hive.HiveColumnHandle)1 HiveUtil.getColumnNames (io.prestosql.plugin.hive.HiveUtil.getColumnNames)1 HiveUtil.getColumnTypes (io.prestosql.plugin.hive.HiveUtil.getColumnTypes)1 StorageFormat (io.prestosql.plugin.hive.metastore.StorageFormat)1 HdfsRcFileDataSource (io.prestosql.plugin.hive.rcfile.HdfsRcFileDataSource)1 RcFilePageSourceFactory (io.prestosql.plugin.hive.rcfile.RcFilePageSourceFactory)1 AircompressorCodecFactory (io.prestosql.rcfile.AircompressorCodecFactory)1 HadoopCodecFactory (io.prestosql.rcfile.HadoopCodecFactory)1 RcFileCorruptionException (io.prestosql.rcfile.RcFileCorruptionException)1 RcFileDataSource (io.prestosql.rcfile.RcFileDataSource)1 RcFileReader (io.prestosql.rcfile.RcFileReader)1