use of io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA in project trino by trinodb.
the class OrcPageSourceFactory method verifyAcidSchema.
static void verifyAcidSchema(OrcReader orcReader, Path path) {
OrcColumn rootColumn = orcReader.getRootColumn();
List<OrcColumn> nestedColumns = rootColumn.getNestedColumns();
if (nestedColumns.size() != 6) {
throw new TrinoException(HIVE_BAD_DATA, format("ORC ACID file should have 6 columns, found %s %s in %s", nestedColumns.size(), nestedColumns.stream().map(column -> format("%s (%s)", column.getColumnName(), column.getColumnType())).collect(toImmutableList()), path));
}
verifyAcidColumn(orcReader, 0, AcidSchema.ACID_COLUMN_OPERATION, INT, path);
verifyAcidColumn(orcReader, 1, AcidSchema.ACID_COLUMN_ORIGINAL_TRANSACTION, LONG, path);
verifyAcidColumn(orcReader, 2, AcidSchema.ACID_COLUMN_BUCKET, INT, path);
verifyAcidColumn(orcReader, 3, AcidSchema.ACID_COLUMN_ROW_ID, LONG, path);
verifyAcidColumn(orcReader, 4, AcidSchema.ACID_COLUMN_CURRENT_TRANSACTION, LONG, path);
verifyAcidColumn(orcReader, 5, AcidSchema.ACID_COLUMN_ROW_STRUCT, STRUCT, path);
}
use of io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA in project trino by trinodb.
the class ParquetPageSourceFactory method createPageSource.
/**
* This method is available for other callers to use directly.
*/
public static ReaderPageSource createPageSource(Path path, long start, long length, long estimatedFileSize, List<HiveColumnHandle> columns, TupleDomain<HiveColumnHandle> effectivePredicate, boolean useColumnNames, HdfsEnvironment hdfsEnvironment, Configuration configuration, ConnectorIdentity identity, DateTimeZone timeZone, FileFormatDataSourceStats stats, ParquetReaderOptions options) {
// Ignore predicates on partial columns for now.
effectivePredicate = effectivePredicate.filter((column, domain) -> column.isBaseColumn());
MessageType fileSchema;
MessageType requestedSchema;
MessageColumnIO messageColumn;
ParquetReader parquetReader;
ParquetDataSource dataSource = null;
try {
FileSystem fileSystem = hdfsEnvironment.getFileSystem(identity, path, configuration);
FSDataInputStream inputStream = hdfsEnvironment.doAs(identity, () -> fileSystem.open(path));
dataSource = new HdfsParquetDataSource(new ParquetDataSourceId(path.toString()), estimatedFileSize, inputStream, stats, options);
ParquetMetadata parquetMetadata = MetadataReader.readFooter(dataSource);
FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
fileSchema = fileMetaData.getSchema();
Optional<MessageType> message = projectSufficientColumns(columns).map(projection -> projection.get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns).stream().filter(column -> column.getColumnType() == REGULAR).map(column -> getColumnType(column, fileSchema, useColumnNames)).filter(Optional::isPresent).map(Optional::get).map(type -> new MessageType(fileSchema.getName(), type)).reduce(MessageType::union);
requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
messageColumn = getColumnIO(fileSchema, requestedSchema);
Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, requestedSchema);
TupleDomain<ColumnDescriptor> parquetTupleDomain = options.isIgnoreStatistics() ? TupleDomain.all() : getParquetTupleDomain(descriptorsByPath, effectivePredicate, fileSchema, useColumnNames);
Predicate parquetPredicate = buildPredicate(requestedSchema, parquetTupleDomain, descriptorsByPath, timeZone);
long nextStart = 0;
ImmutableList.Builder<BlockMetaData> blocks = ImmutableList.builder();
ImmutableList.Builder<Long> blockStarts = ImmutableList.builder();
ImmutableList.Builder<Optional<ColumnIndexStore>> columnIndexes = ImmutableList.builder();
for (BlockMetaData block : parquetMetadata.getBlocks()) {
long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
Optional<ColumnIndexStore> columnIndex = getColumnIndexStore(dataSource, block, descriptorsByPath, parquetTupleDomain, options);
if (start <= firstDataPage && firstDataPage < start + length && predicateMatches(parquetPredicate, block, dataSource, descriptorsByPath, parquetTupleDomain, columnIndex)) {
blocks.add(block);
blockStarts.add(nextStart);
columnIndexes.add(columnIndex);
}
nextStart += block.getRowCount();
}
parquetReader = new ParquetReader(Optional.ofNullable(fileMetaData.getCreatedBy()), messageColumn, blocks.build(), Optional.of(blockStarts.build()), dataSource, timeZone, newSimpleAggregatedMemoryContext(), options, parquetPredicate, columnIndexes.build());
} catch (Exception e) {
try {
if (dataSource != null) {
dataSource.close();
}
} catch (IOException ignored) {
}
if (e instanceof TrinoException) {
throw (TrinoException) e;
}
if (e instanceof ParquetCorruptionException) {
throw new TrinoException(HIVE_BAD_DATA, e);
}
if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
}
String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
if (e instanceof BlockMissingException) {
throw new TrinoException(HIVE_MISSING_DATA, message, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
}
Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
List<HiveColumnHandle> baseColumns = readerProjections.map(projection -> projection.get().stream().map(HiveColumnHandle.class::cast).collect(toUnmodifiableList())).orElse(columns);
for (HiveColumnHandle column : baseColumns) {
checkArgument(column == PARQUET_ROW_INDEX_COLUMN || column.getColumnType() == REGULAR, "column type must be REGULAR: %s", column);
}
ImmutableList.Builder<Type> trinoTypes = ImmutableList.builder();
ImmutableList.Builder<Optional<Field>> internalFields = ImmutableList.builder();
ImmutableList.Builder<Boolean> rowIndexColumns = ImmutableList.builder();
for (HiveColumnHandle column : baseColumns) {
trinoTypes.add(column.getBaseType());
rowIndexColumns.add(column == PARQUET_ROW_INDEX_COLUMN);
if (column == PARQUET_ROW_INDEX_COLUMN) {
internalFields.add(Optional.empty());
} else {
internalFields.add(Optional.ofNullable(getParquetType(column, fileSchema, useColumnNames)).flatMap(field -> {
String columnName = useColumnNames ? column.getBaseColumnName() : fileSchema.getFields().get(column.getBaseHiveColumnIndex()).getName();
return constructField(column.getBaseType(), lookupColumnByName(messageColumn, columnName));
}));
}
}
ConnectorPageSource parquetPageSource = new ParquetPageSource(parquetReader, trinoTypes.build(), rowIndexColumns.build(), internalFields.build());
return new ReaderPageSource(parquetPageSource, readerProjections);
}
use of io.trino.plugin.hive.HiveErrorCode.HIVE_BAD_DATA in project trino by trinodb.
the class HiveUtil method createRecordReader.
public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns) {
// determine which hive columns we will read
List<HiveColumnHandle> readColumns = columns.stream().filter(column -> column.getColumnType() == REGULAR).collect(toImmutableList());
// Projected columns are not supported here
readColumns.forEach(readColumn -> checkArgument(readColumn.isBaseColumn(), "column %s is not a base column", readColumn.getName()));
List<Integer> readHiveColumnIndexes = readColumns.stream().map(HiveColumnHandle::getBaseHiveColumnIndex).collect(toImmutableList());
// Tell hive the columns we would like to read, this lets hive optimize reading column oriented files
configuration = copy(configuration);
setReadColumns(configuration, readHiveColumnIndexes);
InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
JobConf jobConf = toJobConf(configuration);
FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);
// propagate serialization configuration to getRecordReader
schema.stringPropertyNames().stream().filter(name -> name.startsWith("serialization.")).forEach(name -> jobConf.set(name, schema.getProperty(name)));
configureCompressionCodecs(jobConf);
try {
// raw type on WritableComparable can't be fixed because Utilities#skipHeader takes them raw
@SuppressWarnings({ "rawtypes", "unchecked" }) RecordReader<WritableComparable, Writable> recordReader = (RecordReader<WritableComparable, Writable>) inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
int headerCount = getHeaderCount(schema);
// Only skip header rows when the split is at the beginning of the file
if (start == 0 && headerCount > 0) {
Utilities.skipHeader(recordReader, headerCount, recordReader.createKey(), recordReader.createValue());
}
int footerCount = getFooterCount(schema);
if (footerCount > 0) {
recordReader = new FooterAwareRecordReader<>(recordReader, footerCount, jobConf);
}
return recordReader;
} catch (IOException e) {
if (e instanceof TextLineLengthLimitExceededException) {
throw new TrinoException(HIVE_BAD_DATA, "Line too long in text file: " + path, e);
}
throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s", path, start, length, getInputFormatName(schema), firstNonNull(e.getMessage(), e.getClass().getName())), e);
}
}
Aggregations