Use of io.trino.plugin.hive.HiveTimestampPrecision in project trino by trinodb.
In class RcFilePageSourceFactory, method createPageSource:
@Override
public Optional<ReaderPageSource> createPageSource(
        Configuration configuration,
        ConnectorSession session,
        Path path,
        long start,
        long length,
        long estimatedFileSize,
        Properties schema,
        List<HiveColumnHandle> columns,
        TupleDomain<HiveColumnHandle> effectivePredicate,
        Optional<AcidInfo> acidInfo,
        OptionalInt bucketNumber,
        boolean originalFile,
        AcidTransaction transaction)
{
    RcFileEncoding rcFileEncoding;
    String deserializerClassName = getDeserializerClassName(schema);
    if (deserializerClassName.equals(LazyBinaryColumnarSerDe.class.getName())) {
        rcFileEncoding = new BinaryRcFileEncoding(timeZone);
    }
    else if (deserializerClassName.equals(ColumnarSerDe.class.getName())) {
        rcFileEncoding = createTextVectorEncoding(schema);
    }
    else {
        return Optional.empty();
    }

    checkArgument(acidInfo.isEmpty(), "Acid is not supported");

    List<HiveColumnHandle> projectedReaderColumns = columns;
    Optional<ReaderColumns> readerProjections = projectBaseColumns(columns);
    if (readerProjections.isPresent()) {
        projectedReaderColumns = readerProjections.get().get().stream()
                .map(HiveColumnHandle.class::cast)
                .collect(toImmutableList());
    }

    RcFileDataSource dataSource;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(session.getIdentity(), path, configuration);
        FSDataInputStream inputStream = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.open(path));
        if (estimatedFileSize < BUFFER_SIZE.toBytes()) {
            // Handle potentially imprecise file lengths by reading the footer
            try {
                FSDataInputStreamTail fileTail = FSDataInputStreamTail.readTail(path.toString(), estimatedFileSize, inputStream, toIntExact(BUFFER_SIZE.toBytes()));
                dataSource = new MemoryRcFileDataSource(new RcFileDataSourceId(path.toString()), fileTail.getTailSlice());
            }
            finally {
                inputStream.close();
            }
        }
        else {
            long fileSize = hdfsEnvironment.doAs(session.getIdentity(), () -> fileSystem.getFileStatus(path).getLen());
            dataSource = new HdfsRcFileDataSource(path.toString(), inputStream, fileSize, stats);
        }
    }
    catch (Exception e) {
        if (nullToEmpty(e.getMessage()).trim().equals("Filesystem closed") || e instanceof FileNotFoundException) {
            throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, splitError(e, path, start, length), e);
    }

    length = min(dataSource.getSize() - start, length);

    // Split may be empty now that the correct file size is known
    if (length <= 0) {
        return Optional.of(noProjectionAdaptation(new EmptyPageSource()));
    }

    try {
        ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
        HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
        for (HiveColumnHandle column : projectedReaderColumns) {
            readColumns.put(column.getBaseHiveColumnIndex(), column.getHiveType().getType(typeManager, timestampPrecision));
        }

        RcFileReader rcFileReader = new RcFileReader(
                dataSource,
                rcFileEncoding,
                readColumns.buildOrThrow(),
                new AircompressorCodecFactory(new HadoopCodecFactory(configuration.getClassLoader())),
                start,
                length,
                BUFFER_SIZE);

        ConnectorPageSource pageSource = new RcFilePageSource(rcFileReader, projectedReaderColumns);
        return Optional.of(new ReaderPageSource(pageSource, readerProjections));
    }
    catch (Throwable e) {
        try {
            dataSource.close();
        }
        catch (IOException ignored) {
        }
        if (e instanceof TrinoException) {
            throw (TrinoException) e;
        }
        String message = splitError(e, path, start, length);
        if (e instanceof RcFileCorruptionException) {
            throw new TrinoException(HIVE_BAD_DATA, message, e);
        }
        if (e instanceof BlockMissingException) {
            throw new TrinoException(HIVE_MISSING_DATA, message, e);
        }
        throw new TrinoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
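HiveTimestampPrecision only enters this method in the loop that builds the read-column map: the session-level precision decides whether a Hive TIMESTAMP column is read as timestamp(3), timestamp(6), or timestamp(9). A minimal sketch of that step in isolation (the helper name and its parameter list are ours, not Trino's):

import java.util.List;
import java.util.Map;

import com.google.common.collect.ImmutableMap;
import io.trino.plugin.hive.HiveColumnHandle;
import io.trino.plugin.hive.HiveTimestampPrecision;
import io.trino.spi.type.Type;
import io.trino.spi.type.TypeManager;

// Hypothetical helper: map each projected base column index to the Trino type it should be
// read as, honoring the session's hive.timestamp_precision setting.
static Map<Integer, Type> readColumnTypes(
        List<HiveColumnHandle> projectedReaderColumns,
        TypeManager typeManager,
        HiveTimestampPrecision timestampPrecision)
{
    ImmutableMap.Builder<Integer, Type> readColumns = ImmutableMap.builder();
    for (HiveColumnHandle column : projectedReaderColumns) {
        // For a Hive "timestamp" column this resolves to timestamp(3), timestamp(6) or timestamp(9)
        readColumns.put(column.getBaseHiveColumnIndex(), column.getHiveType().getType(typeManager, timestampPrecision));
    }
    return readColumns.buildOrThrow();
}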
Use of io.trino.plugin.hive.HiveTimestampPrecision in project trino by trinodb.
In class HiveTypeTranslator, method toTypeSignature:
public static TypeSignature toTypeSignature(TypeInfo typeInfo, HiveTimestampPrecision timestampPrecision)
{
    switch (typeInfo.getCategory()) {
        case PRIMITIVE:
            Type primitiveType = fromPrimitiveType((PrimitiveTypeInfo) typeInfo, timestampPrecision);
            if (primitiveType == null) {
                break;
            }
            return primitiveType.getTypeSignature();
        case MAP:
            MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;
            return mapType(
                    toTypeSignature(mapTypeInfo.getMapKeyTypeInfo(), timestampPrecision),
                    toTypeSignature(mapTypeInfo.getMapValueTypeInfo(), timestampPrecision));
        case LIST:
            ListTypeInfo listTypeInfo = (ListTypeInfo) typeInfo;
            TypeSignature elementType = toTypeSignature(listTypeInfo.getListElementTypeInfo(), timestampPrecision);
            return arrayType(typeParameter(elementType));
        case STRUCT:
            StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
            List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos();
            List<String> fieldNames = structTypeInfo.getAllStructFieldNames();
            if (fieldTypes.size() != fieldNames.size()) {
                throw new TrinoException(HiveErrorCode.HIVE_INVALID_METADATA, format("Invalid Hive struct type: %s", typeInfo));
            }
            return rowType(Streams.zip(
                    // TODO: This is a hack. Trino engine should be able to handle identifiers in a case insensitive way where necessary.
                    fieldNames.stream().map(s -> s.toLowerCase(Locale.US)),
                    fieldTypes.stream().map(type -> toTypeSignature(type, timestampPrecision)),
                    TypeSignatureParameter::namedField)
                    .collect(Collectors.toList()));
        case UNION:
            // Use a row type to represent a union type in Hive for reading
            UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo;
            List<TypeInfo> unionObjectTypes = unionTypeInfo.getAllUnionObjectTypeInfos();
            ImmutableList.Builder<TypeSignatureParameter> typeSignatures = ImmutableList.builder();
            typeSignatures.add(namedField("tag", TINYINT.getTypeSignature()));
            for (int i = 0; i < unionObjectTypes.size(); i++) {
                TypeInfo unionObjectType = unionObjectTypes.get(i);
                typeSignatures.add(namedField("field" + i, toTypeSignature(unionObjectType, timestampPrecision)));
            }
            return rowType(typeSignatures.build());
    }
    throw new TrinoException(NOT_SUPPORTED, format("Unsupported Hive type: %s", typeInfo));
}
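As a rough usage sketch (assuming the Hive serde TypeInfoUtils is on the classpath, as it is in this module), a Hive type string can be parsed and translated at a chosen precision; with NANOSECONDS the nested timestamp field comes back as timestamp(9) inside the row signature:

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

import io.trino.plugin.hive.HiveTimestampPrecision;
import io.trino.spi.type.TypeSignature;

static TypeSignature structWithNanosecondTimestamps()
{
    // Parse a Hive struct type and translate it; the "ts" field maps to timestamp(9) here.
    TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString("struct<ts:timestamp,tags:array<string>>");
    return HiveTypeTranslator.toTypeSignature(typeInfo, HiveTimestampPrecision.NANOSECONDS);
}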
Use of io.trino.plugin.hive.HiveTimestampPrecision in project trino by trinodb.
In class HiveBucketing, method getHiveBucketHandle:
public static Optional<HiveBucketHandle> getHiveBucketHandle(ConnectorSession session, Table table, TypeManager typeManager)
{
    if (table.getParameters().containsKey(SPARK_TABLE_PROVIDER_KEY)) {
        return Optional.empty();
    }

    Optional<HiveBucketProperty> hiveBucketProperty = table.getStorage().getBucketProperty();
    if (hiveBucketProperty.isEmpty()) {
        return Optional.empty();
    }

    if (!isSupportedBucketing(table)) {
        return Optional.empty();
    }

    HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
    Map<String, HiveColumnHandle> map = getRegularColumnHandles(table, typeManager, timestampPrecision).stream()
            .collect(Collectors.toMap(HiveColumnHandle::getName, identity()));

    ImmutableList.Builder<HiveColumnHandle> bucketColumns = ImmutableList.builder();
    for (String bucketColumnName : hiveBucketProperty.get().getBucketedBy()) {
        HiveColumnHandle bucketColumnHandle = map.get(bucketColumnName);
        if (bucketColumnHandle == null) {
            throw new TrinoException(HIVE_INVALID_METADATA, format("Table '%s.%s' is bucketed on non-existent column '%s'", table.getDatabaseName(), table.getTableName(), bucketColumnName));
        }
        bucketColumns.add(bucketColumnHandle);
    }

    BucketingVersion bucketingVersion = hiveBucketProperty.get().getBucketingVersion();
    int bucketCount = hiveBucketProperty.get().getBucketCount();
    List<SortingColumn> sortedBy = hiveBucketProperty.get().getSortedBy();
    return Optional.of(new HiveBucketHandle(bucketColumns.build(), bucketingVersion, bucketCount, bucketCount, sortedBy));
}
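A hedged call-site sketch follows; the surrounding method and the accessor names on HiveBucketHandle are assumptions inferred from the constructor arguments above, not verified API:

import java.util.Optional;

import io.trino.plugin.hive.HiveBucketHandle;
import io.trino.plugin.hive.metastore.Table;
import io.trino.spi.connector.ConnectorSession;
import io.trino.spi.type.TypeManager;

// Hypothetical caller: resolve the bucket handle, if the table is bucketed in a scheme Trino supports.
static void describeBucketing(ConnectorSession session, Table table, TypeManager typeManager)
{
    Optional<HiveBucketHandle> bucketHandle = HiveBucketing.getHiveBucketHandle(session, table, typeManager);
    if (bucketHandle.isEmpty()) {
        // not bucketed, a Spark-provided table, or an unsupported bucketing scheme
        return;
    }
    // getColumns() is assumed here, mirroring the bucketColumns list passed to the constructor above
    bucketHandle.get().getColumns()
            .forEach(column -> System.out.println("bucketed by " + column.getName()));
}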
Use of io.trino.plugin.hive.HiveTimestampPrecision in project trino by trinodb.
In class TestHiveCompatibility, method testTimestampFieldWrittenByOptimizedParquetWriterCannotBeReadByHive:
@Test(groups = STORAGE_FORMATS_DETAILED)
public void testTimestampFieldWrittenByOptimizedParquetWriterCannotBeReadByHive()
        throws Exception
{
    // only admin user is allowed to change session properties
    setAdminRole(onTrino().getConnection());
    setSessionProperty(onTrino().getConnection(), "hive.experimental_parquet_optimized_writer_enabled", "true");

    String tableName = "parquet_table_timestamp_created_in_trino";
    onTrino().executeQuery("DROP TABLE IF EXISTS " + tableName);
    onTrino().executeQuery("CREATE TABLE " + tableName + "(timestamp_precision varchar, a_timestamp timestamp) WITH (format = 'PARQUET')");
    for (HiveTimestampPrecision hiveTimestampPrecision : HiveTimestampPrecision.values()) {
        setSessionProperty(onTrino().getConnection(), "hive.timestamp_precision", hiveTimestampPrecision.name());
        onTrino().executeQuery("INSERT INTO " + tableName + " VALUES ('" + hiveTimestampPrecision.name() + "', TIMESTAMP '2021-01-05 12:01:00.111901001')");
        // Hive expects `INT96` (deprecated on Parquet) for timestamp values
        assertQueryFailure(() -> onHive().executeQuery("SELECT a_timestamp FROM " + tableName + " WHERE timestamp_precision = '" + hiveTimestampPrecision.name() + "'"))
                .hasMessageMatching(".*java.lang.ClassCastException: org.apache.hadoop.io.LongWritable cannot be cast to org.apache.hadoop.hive.serde2.io.(TimestampWritable|TimestampWritableV2)");
    }
    onTrino().executeQuery(format("DROP TABLE %s", tableName));
}
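For reference, the session values the loop cycles through, and the Trino timestamp precision each one implies (a standalone sketch, separate from the test):

import io.trino.plugin.hive.HiveTimestampPrecision;

// MILLISECONDS -> timestamp(3), MICROSECONDS -> timestamp(6), NANOSECONDS -> timestamp(9)
for (HiveTimestampPrecision precision : HiveTimestampPrecision.values()) {
    System.out.printf("hive.timestamp_precision=%s -> timestamp(%d)%n", precision.name(), precision.getPrecision());
}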
Use of io.trino.plugin.hive.HiveTimestampPrecision in project trino by trinodb.
In class TestHiveStorageFormats, method assertStructTimestamps:
/**
 * Assertions for tables created by {@link #createStructTimestampTable(String, StorageFormat)}
 */
private void assertStructTimestamps(String tableName, Collection<TimestampAndPrecision> data)
{
    SoftAssertions softly = new SoftAssertions();
    for (HiveTimestampPrecision precision : HiveTimestampPrecision.values()) {
        setTimestampPrecision(precision);

        // Check that the correct types are read
        String type = format("timestamp(%d)", precision.getPrecision());
        softly.check(() -> assertThat(onTrino().executeQuery(format(
                "SELECT" + " typeof(arr)," + " typeof(map)," + " typeof(row)," + " typeof(nested)" + " FROM %s" + " LIMIT 1", tableName)))
                .as("timestamp container types")
                .containsOnly(row(
                        format("array(%s)", type),
                        format("map(%1$s, %1$s)", type),
                        format("row(col %s)", type),
                        format("array(map(%1$s, row(col array(%1$s))))", type))));

        // Check the values as varchar
        softly.check(() -> assertThat(onTrino().executeQuery(format(
                "SELECT" + " id,"
                        + " CAST(arr[1] AS VARCHAR),"
                        + " CAST(map_entries(map)[1][1] AS VARCHAR)," // key
                        + " CAST(map_entries(map)[1][2] AS VARCHAR)," // value
                        + " CAST(row.col AS VARCHAR),"
                        + " CAST(map_entries(nested[1])[1][1] AS VARCHAR)," // key
                        + " CAST(map_entries(nested[1])[1][2].col[1] AS VARCHAR)" // value
                        + " FROM %s" + " ORDER BY id", tableName)))
                .as("timestamp containers as varchar")
                .containsExactlyInOrder(data.stream()
                        .sorted(comparingInt(TimestampAndPrecision::getId))
                        .map(e -> new Row(Lists.asList(e.getId(), nCopies(6, e.getReadValue(precision)).toArray())))
                        .collect(toList())));

        // Check the values directly
        softly.check(() -> assertThat(onTrino().executeQuery(format(
                "SELECT" + " id,"
                        + " arr[1],"
                        + " map_entries(map)[1][1]," // key
                        + " map_entries(map)[1][2]," // value
                        + " row.col,"
                        + " map_entries(nested[1])[1][1]," // key
                        + " map_entries(nested[1])[1][2].col[1]" // value
                        + " FROM %s" + " ORDER BY id", tableName)))
                .as("timestamp containers")
                .containsExactlyInOrder(data.stream()
                        .sorted(comparingInt(TimestampAndPrecision::getId))
                        .map(e -> new Row(Lists.asList(e.getId(), nCopies(6, Timestamp.valueOf(e.getReadValue(precision))).toArray())))
                        .collect(toList())));
    }
    softly.assertAll();
}
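The setTimestampPrecision call at the top of the loop is a private helper in TestHiveStorageFormats; a plausible sketch of it, modeled on the setSessionProperty usage in the previous example (the body and the exception handling are assumptions, not the actual implementation):

import java.sql.SQLException;

import io.trino.plugin.hive.HiveTimestampPrecision;

// Assumed helper: switch the hive.timestamp_precision session property before the next read.
private static void setTimestampPrecision(HiveTimestampPrecision precision)
{
    try {
        setSessionProperty(onTrino().getConnection(), "hive.timestamp_precision", precision.name());
    }
    catch (SQLException e) {
        throw new RuntimeException(e);
    }
}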