Search in sources :

Example 1 with FooterAwareRecordReader

use of com.facebook.presto.hive.util.FooterAwareRecordReader in project presto by prestodb.

the class HiveUtil method createRecordReader.

public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns, Map<String, String> customSplitInfo) {
    // determine which hive columns we will read
    List<HiveColumnHandle> readColumns = ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
    List<Integer> readHiveColumnIndexes = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));
    // Tell hive the columns we would like to read, this lets hive optimize reading column oriented files
    setReadColumns(configuration, readHiveColumnIndexes);
    // Only propagate serialization schema configs by default
    Predicate<String> schemaFilter = schemaProperty -> schemaProperty.startsWith("serialization.");
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, getInputFormatName(schema), true);
    JobConf jobConf = toJobConf(configuration);
    FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);
    if (!customSplitInfo.isEmpty() && isHudiRealtimeSplit(customSplitInfo)) {
        fileSplit = recreateSplitWithCustomInfo(fileSplit, customSplitInfo);
        // Add additional column information for record reader
        List<String> readHiveColumnNames = ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getName));
        jobConf.set(READ_COLUMN_NAMES_CONF_STR, Joiner.on(',').join(readHiveColumnNames));
        // Remove filter when using customSplitInfo as the record reader requires complete schema configs
        schemaFilter = schemaProperty -> true;
    }
    schema.stringPropertyNames().stream().filter(schemaFilter).forEach(name -> jobConf.set(name, schema.getProperty(name)));
    // add Airlift LZO and LZOP to head of codecs list so as to not override existing entries
    List<String> codecs = newArrayList(Splitter.on(",").trimResults().omitEmptyStrings().split(jobConf.get("io.compression.codecs", "")));
    if (!codecs.contains(LzoCodec.class.getName())) {
        codecs.add(0, LzoCodec.class.getName());
    }
    if (!codecs.contains(LzopCodec.class.getName())) {
        codecs.add(0, LzopCodec.class.getName());
    }
    jobConf.set("io.compression.codecs", codecs.stream().collect(joining(",")));
    try {
        RecordReader<WritableComparable, Writable> recordReader = (RecordReader<WritableComparable, Writable>) inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
        int headerCount = getHeaderCount(schema);
        // Only skip header rows when the split is at the beginning of the file
        if (start == 0 && headerCount > 0) {
            Utilities.skipHeader(recordReader, headerCount, recordReader.createKey(), recordReader.createValue());
        }
        int footerCount = getFooterCount(schema);
        if (footerCount > 0) {
            recordReader = new FooterAwareRecordReader<>(recordReader, footerCount, jobConf);
        }
        return recordReader;
    } catch (IOException e) {
        if (e instanceof TextLineLengthLimitExceededException) {
            throw new PrestoException(HIVE_BAD_DATA, "Line too long in text file: " + path, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s", path, start, length, getInputFormatName(schema), firstNonNull(e.getMessage(), e.getClass().getName())), e);
    }
}
Also used : HIVE_TABLE_BUCKETING_IS_IGNORED(com.facebook.presto.hive.HiveErrorCode.HIVE_TABLE_BUCKETING_IS_IGNORED) DateTimeZone(org.joda.time.DateTimeZone) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) DECIMAL_TYPE_NAME(org.apache.hadoop.hive.serde.serdeConstants.DECIMAL_TYPE_NAME) GENERIC_INTERNAL_ERROR(com.facebook.presto.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR) Writable(org.apache.hadoop.io.Writable) Short.parseShort(java.lang.Short.parseShort) TypeSignature(com.facebook.presto.common.type.TypeSignature) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) MAX_PARTITION_KEY_COLUMN_INDEX(com.facebook.presto.hive.HiveColumnHandle.MAX_PARTITION_KEY_COLUMN_INDEX) BigDecimal(java.math.BigDecimal) FileSplit(org.apache.hadoop.mapred.FileSplit) Matcher(java.util.regex.Matcher) Slices(io.airlift.slice.Slices) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Varchars.isVarcharType(com.facebook.presto.common.type.Varchars.isVarcharType) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) BigInteger(java.math.BigInteger) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) HiveColumnHandle.isFileModifiedTimeColumnHandle(com.facebook.presto.hive.HiveColumnHandle.isFileModifiedTimeColumnHandle) Double.parseDouble(java.lang.Double.parseDouble) SERIALIZATION_LIB(org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_LIB) PageInputFormat(com.facebook.presto.hive.pagefile.PageInputFormat) LzoCodec(io.airlift.compress.lzo.LzoCodec) NullableValue(com.facebook.presto.common.predicate.NullableValue) HIVE_INVALID_METADATA(com.facebook.presto.hive.HiveErrorCode.HIVE_INVALID_METADATA) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HIVE_SERDE_NOT_FOUND(com.facebook.presto.hive.HiveErrorCode.HIVE_SERDE_NOT_FOUND) READ_ALL_COLUMNS(org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_ALL_COLUMNS) HiveColumnHandle.bucketColumnHandle(com.facebook.presto.hive.HiveColumnHandle.bucketColumnHandle) Collectors.joining(java.util.stream.Collectors.joining) InvocationTargetException(java.lang.reflect.InvocationTargetException) TypeUtils.isEnumType(com.facebook.presto.common.type.TypeUtils.isEnumType) UncheckedIOException(java.io.UncheckedIOException) Decimals.isShortDecimal(com.facebook.presto.common.type.Decimals.isShortDecimal) Lists.newArrayList(com.google.common.collect.Lists.newArrayList) FooterAwareRecordReader(com.facebook.presto.hive.util.FooterAwareRecordReader) Predicate(com.google.common.base.Predicate) INTEGER(com.facebook.presto.common.type.IntegerType.INTEGER) DateTimePrinter(org.joda.time.format.DateTimePrinter) RecordReader(org.apache.hadoop.mapred.RecordReader) Iterables.filter(com.google.common.collect.Iterables.filter) Joiner(com.google.common.base.Joiner) StandardTypes(com.facebook.presto.common.type.StandardTypes) DecimalType(com.facebook.presto.common.type.DecimalType) Table(com.facebook.presto.hive.metastore.Table) Slice(io.airlift.slice.Slice) DateTimeFormatterBuilder(org.joda.time.format.DateTimeFormatterBuilder) ByteArrayOutputStream(java.io.ByteArrayOutputStream) TypeSignatureParameter(com.facebook.presto.common.type.TypeSignatureParameter) Chars.isCharType(com.facebook.presto.common.type.Chars.isCharType) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) TINYINT(com.facebook.presto.common.type.TinyintType.TINYINT) WritableComparable(org.apache.hadoop.io.WritableComparable) TIMESTAMP(com.facebook.presto.common.type.TimestampType.TIMESTAMP) DATE(com.facebook.presto.common.type.DateType.DATE) REAL(com.facebook.presto.common.type.RealType.REAL) Float.floatToRawIntBits(java.lang.Float.floatToRawIntBits) Byte.parseByte(java.lang.Byte.parseByte) ConfigurationUtils.toJobConf(com.facebook.presto.hive.util.ConfigurationUtils.toJobConf) COLLECTION_DELIM(org.apache.hadoop.hive.serde.serdeConstants.COLLECTION_DELIM) BOOLEAN(com.facebook.presto.common.type.BooleanType.BOOLEAN) HIVE_BAD_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_BAD_DATA) CharType(com.facebook.presto.common.type.CharType) Nullable(javax.annotation.Nullable) BIGINT(com.facebook.presto.common.type.BigintType.BIGINT) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) Properties(java.util.Properties) HiveColumnHandle.fileSizeColumnHandle(com.facebook.presto.hive.HiveColumnHandle.fileSizeColumnHandle) Reporter(org.apache.hadoop.mapred.Reporter) HiveColumnHandle.pathColumnHandle(com.facebook.presto.hive.HiveColumnHandle.pathColumnHandle) IOException(java.io.IOException) HoodieRealtimeFileSplit(org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit) Field(java.lang.reflect.Field) HiveColumnHandle.isBucketColumnHandle(com.facebook.presto.hive.HiveColumnHandle.isBucketColumnHandle) Chars.trimTrailingSpaces(com.facebook.presto.common.type.Chars.trimTrailingSpaces) SMALLINT(com.facebook.presto.common.type.SmallintType.SMALLINT) HiveColumnHandle.fileModifiedTimeColumnHandle(com.facebook.presto.hive.HiveColumnHandle.fileModifiedTimeColumnHandle) Deserializer(org.apache.hadoop.hive.serde2.Deserializer) CustomSplitConversionUtils.recreateSplitWithCustomInfo(com.facebook.presto.hive.util.CustomSplitConversionUtils.recreateSplitWithCustomInfo) TextLineLengthLimitExceededException(com.facebook.presto.hadoop.TextLineLengthLimitExceededException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) Long.parseLong(java.lang.Long.parseLong) ReflectionUtils(org.apache.hadoop.util.ReflectionUtils) ZstdInputStreamNoFinalizer(com.github.luben.zstd.ZstdInputStreamNoFinalizer) DateTimeParser(org.joda.time.format.DateTimeParser) HIVE_INVALID_VIEW_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_INVALID_VIEW_DATA) HiveColumnHandle.isFileSizeColumnHandle(com.facebook.presto.hive.HiveColumnHandle.isFileSizeColumnHandle) HoodieParquetRealtimeInputFormat(org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat) READ_COLUMN_NAMES_CONF_STR(org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR) Float.parseFloat(java.lang.Float.parseFloat) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ByteArrayInputStream(java.io.ByteArrayInputStream) ParquetHiveSerDe(org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe) InputFormat(org.apache.hadoop.mapred.InputFormat) Path(org.apache.hadoop.fs.Path) ConfigurationUtils.copy(com.facebook.presto.hive.util.ConfigurationUtils.copy) Splitter(com.google.common.base.Splitter) Method(java.lang.reflect.Method) SliceUtf8(io.airlift.slice.SliceUtf8) DateTimeFormat(org.joda.time.format.DateTimeFormat) ISODateTimeFormat(org.joda.time.format.ISODateTimeFormat) HIVE_UNSUPPORTED_FORMAT(com.facebook.presto.hive.HiveErrorCode.HIVE_UNSUPPORTED_FORMAT) ZstdOutputStreamNoFinalizer(com.github.luben.zstd.ZstdOutputStreamNoFinalizer) ImmutableMap(com.google.common.collect.ImmutableMap) DOUBLE(com.facebook.presto.common.type.DoubleType.DOUBLE) VarcharType(com.facebook.presto.common.type.VarcharType) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Codec(com.facebook.airlift.json.Codec) Decimals.isLongDecimal(com.facebook.presto.common.type.Decimals.isLongDecimal) String.format(java.lang.String.format) PrestoAvroSerDe(com.facebook.presto.hive.avro.PrestoAvroSerDe) RecordCursor(com.facebook.presto.spi.RecordCursor) Base64(java.util.Base64) List(java.util.List) ColumnMetadata(com.facebook.presto.spi.ColumnMetadata) NOT_SUPPORTED(com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED) Annotation(java.lang.annotation.Annotation) Optional(java.util.Optional) MoreObjects.firstNonNull(com.google.common.base.MoreObjects.firstNonNull) READ_COLUMN_IDS_CONF_STR(org.apache.hadoop.hive.serde2.ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR) Pattern(java.util.regex.Pattern) PARTITION_KEY(com.facebook.presto.hive.HiveColumnHandle.ColumnType.PARTITION_KEY) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) Column(com.facebook.presto.hive.metastore.Column) ROUND_UNNECESSARY(java.math.BigDecimal.ROUND_UNNECESSARY) HIVE_INVALID_PARTITION_VALUE(com.facebook.presto.hive.HiveErrorCode.HIVE_INVALID_PARTITION_VALUE) Lists.transform(com.google.common.collect.Lists.transform) PrestoException(com.facebook.presto.spi.PrestoException) OptionalInt(java.util.OptionalInt) HiveColumnHandle.isPathColumnHandle(com.facebook.presto.hive.HiveColumnHandle.isPathColumnHandle) HIVE_CANNOT_OPEN_SPLIT(com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) LzopCodec(io.airlift.compress.lzo.LzopCodec) SymlinkTextInputFormat(org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) Category(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category) MapredParquetInputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) ImmutableList(com.google.common.collect.ImmutableList) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) TypeManager(com.facebook.presto.common.type.TypeManager) HIVE_FILE_MISSING_COLUMN_NAMES(com.facebook.presto.hive.HiveErrorCode.HIVE_FILE_MISSING_COLUMN_NAMES) Objects.requireNonNull(java.util.Objects.requireNonNull) OrcType(com.facebook.presto.orc.metadata.OrcType) HoodieParquetInputFormat(org.apache.hudi.hadoop.HoodieParquetInputFormat) Type(com.facebook.presto.common.type.Type) NamedTypeSignature(com.facebook.presto.common.type.NamedTypeSignature) VerifyException(com.google.common.base.VerifyException) Storage(com.facebook.presto.hive.metastore.Storage) DateTimeFormatter(org.joda.time.format.DateTimeFormatter) UTF_8(java.nio.charset.StandardCharsets.UTF_8) HIVE_DEFAULT_DYNAMIC_PARTITION(com.facebook.presto.hive.metastore.MetastoreUtil.HIVE_DEFAULT_DYNAMIC_PARTITION) Decimals(com.facebook.presto.common.type.Decimals) Integer.parseInt(java.lang.Integer.parseInt) JavaUtils(org.apache.hadoop.hive.common.JavaUtils) JobConf(org.apache.hadoop.mapred.JobConf) TimeUnit(java.util.concurrent.TimeUnit) FILE_INPUT_FORMAT(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.FILE_INPUT_FORMAT) RowFieldName(com.facebook.presto.common.type.RowFieldName) MetastoreUtil.checkCondition(com.facebook.presto.hive.metastore.MetastoreUtil.checkCondition) InputStream(java.io.InputStream) DecimalType.createDecimalType(com.facebook.presto.common.type.DecimalType.createDecimalType) FooterAwareRecordReader(com.facebook.presto.hive.util.FooterAwareRecordReader) RecordReader(org.apache.hadoop.mapred.RecordReader) Writable(org.apache.hadoop.io.Writable) LzoCodec(io.airlift.compress.lzo.LzoCodec) PrestoException(com.facebook.presto.spi.PrestoException) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieRealtimeFileSplit(org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit) BigInteger(java.math.BigInteger) LzopCodec(io.airlift.compress.lzo.LzopCodec) TextLineLengthLimitExceededException(com.facebook.presto.hadoop.TextLineLengthLimitExceededException) WritableComparable(org.apache.hadoop.io.WritableComparable) ConfigurationUtils.toJobConf(com.facebook.presto.hive.util.ConfigurationUtils.toJobConf) JobConf(org.apache.hadoop.mapred.JobConf)

Aggregations

Codec (com.facebook.airlift.json.Codec)1 NullableValue (com.facebook.presto.common.predicate.NullableValue)1 BIGINT (com.facebook.presto.common.type.BigintType.BIGINT)1 BOOLEAN (com.facebook.presto.common.type.BooleanType.BOOLEAN)1 CharType (com.facebook.presto.common.type.CharType)1 Chars.isCharType (com.facebook.presto.common.type.Chars.isCharType)1 Chars.trimTrailingSpaces (com.facebook.presto.common.type.Chars.trimTrailingSpaces)1 DATE (com.facebook.presto.common.type.DateType.DATE)1 DecimalType (com.facebook.presto.common.type.DecimalType)1 DecimalType.createDecimalType (com.facebook.presto.common.type.DecimalType.createDecimalType)1 Decimals (com.facebook.presto.common.type.Decimals)1 Decimals.isLongDecimal (com.facebook.presto.common.type.Decimals.isLongDecimal)1 Decimals.isShortDecimal (com.facebook.presto.common.type.Decimals.isShortDecimal)1 DOUBLE (com.facebook.presto.common.type.DoubleType.DOUBLE)1 INTEGER (com.facebook.presto.common.type.IntegerType.INTEGER)1 NamedTypeSignature (com.facebook.presto.common.type.NamedTypeSignature)1 REAL (com.facebook.presto.common.type.RealType.REAL)1 RowFieldName (com.facebook.presto.common.type.RowFieldName)1 SMALLINT (com.facebook.presto.common.type.SmallintType.SMALLINT)1 StandardTypes (com.facebook.presto.common.type.StandardTypes)1