Use of io.trino.plugin.hive.orc.OrcReaderConfig in project trino by trinodb.
From class TestHiveFileFormats, method testOrcOptimizedWriter.
@Test(dataProvider = "validRowAndFileSizePadding")
public void testOrcOptimizedWriter(int rowCount, long fileSizePadding)
        throws Exception
{
    HiveSessionProperties hiveSessionProperties = new HiveSessionProperties(
            new HiveConfig(),
            new OrcReaderConfig(),
            new OrcWriterConfig().setValidationPercentage(100.0),
            new ParquetReaderConfig(),
            new ParquetWriterConfig());
    ConnectorSession session = TestingConnectorSession.builder()
            .setPropertyMetadata(hiveSessionProperties.getSessionProperties())
            .build();

    // A Trino page cannot contain a map with null keys, so a page based writer cannot write null keys
    List<TestColumn> testColumns = TEST_COLUMNS.stream()
            .filter(TestHiveFileFormats::withoutNullMapKeyTests)
            .collect(toList());

    assertThatFileFormat(ORC)
            .withColumns(testColumns)
            .withRowsCount(rowCount)
            .withSession(session)
            .withFileSizePadding(fileSizePadding)
            .withFileWriterFactory(new OrcFileWriterFactory(HDFS_ENVIRONMENT, TESTING_TYPE_MANAGER, new NodeVersion("test"), STATS, new OrcWriterOptions()))
            .isReadableByRecordCursor(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT))
            .isReadableByPageSource(new OrcPageSourceFactory(new OrcReaderOptions(), HDFS_ENVIRONMENT, STATS, UTC));
}
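The session wiring above can be factored out and reused. A minimal sketch, using only the constructors and setters already shown in this test; the helper name orcSessionWithWriterValidation is hypothetical and not part of the Trino test suite:

// Hypothetical helper sketch: builds a test ConnectorSession whose ORC writer
// validates 100% of written files, mirroring the construction above.
private static ConnectorSession orcSessionWithWriterValidation()
{
    HiveSessionProperties properties = new HiveSessionProperties(
            new HiveConfig(),
            new OrcReaderConfig(),
            new OrcWriterConfig().setValidationPercentage(100.0),
            new ParquetReaderConfig(),
            new ParquetWriterConfig());
    return TestingConnectorSession.builder()
            .setPropertyMetadata(properties.getSessionProperties())
            .build();
}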
Use of io.trino.plugin.hive.orc.OrcReaderConfig in project trino by trinodb.
From class TestOrcPageSourceMemoryTracking, method testMaxReadBytes.
@Test(dataProvider = "rowCount")
public void testMaxReadBytes(int rowCount)
        throws Exception
{
    int maxReadBytes = 1_000;
    HiveSessionProperties hiveSessionProperties = new HiveSessionProperties(
            new HiveConfig(),
            new OrcReaderConfig().setMaxBlockSize(DataSize.ofBytes(maxReadBytes)),
            new OrcWriterConfig(),
            new ParquetReaderConfig(),
            new ParquetWriterConfig());
    ConnectorSession session = TestingConnectorSession.builder()
            .setPropertyMetadata(hiveSessionProperties.getSessionProperties())
            .build();
    FileFormatDataSourceStats stats = new FileFormatDataSourceStats();

    // Build a table where every row gets larger, so we can test that the "batchSize" reduces
    int numColumns = 5;
    int step = 250;
    ImmutableList.Builder<TestColumn> columnBuilder = ImmutableList.<TestColumn>builder()
            .add(new TestColumn("p_empty_string", javaStringObjectInspector, () -> "", true));
    GrowingTestColumn[] dataColumns = new GrowingTestColumn[numColumns];
    for (int i = 0; i < numColumns; i++) {
        dataColumns[i] = new GrowingTestColumn("p_string" + "_" + i, javaStringObjectInspector, () -> Long.toHexString(random.nextLong()), false, step * (i + 1));
        columnBuilder.add(dataColumns[i]);
    }
    List<TestColumn> testColumns = columnBuilder.build();

    File tempFile = File.createTempFile("trino_test_orc_page_source_max_read_bytes", "orc");
    tempFile.delete();
    TestPreparer testPreparer = new TestPreparer(tempFile.getAbsolutePath(), testColumns, rowCount, rowCount);
    ConnectorPageSource pageSource = testPreparer.newPageSource(stats, session);
    try {
        int positionCount = 0;
        while (true) {
            Page page = pageSource.getNextPage();
            if (pageSource.isFinished()) {
                break;
            }
            assertNotNull(page);
            page = page.getLoadedPage();
            positionCount += page.getPositionCount();
            // ignore the first MAX_BATCH_SIZE rows given the sizes are set when loading the blocks
            if (positionCount > MAX_BATCH_SIZE) {
                // either the block is bounded by maxReadBytes or we just load one single large block
                // an error margin MAX_BATCH_SIZE / step is needed given the block sizes are increasing
                assertTrue(page.getSizeInBytes() < maxReadBytes * (MAX_BATCH_SIZE / step) || 1 == page.getPositionCount());
            }
        }

        // verify the stats are correctly recorded
        Distribution distribution = stats.getMaxCombinedBytesPerRow().getAllTime();
        assertEquals((int) distribution.getCount(), 1);
        // the block is a VariableWidthBlock that contains valueIsNull and offsets arrays as overhead
        assertEquals(
                (int) distribution.getMax(),
                Arrays.stream(dataColumns).mapToInt(GrowingTestColumn::getMaxSize).sum() + (Integer.BYTES + Byte.BYTES) * numColumns);

        pageSource.close();
    }
    finally {
        tempFile.delete();
    }
}
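The read loop above can be boiled down to a small utility. A sketch, assuming only the ConnectorPageSource and Page calls exercised in this test; drainPageSource is a hypothetical name, not part of the Trino test suite:

// Hypothetical helper sketch: drains a ConnectorPageSource and returns the total
// number of positions read, loading each page before counting its positions.
private static int drainPageSource(ConnectorPageSource pageSource)
{
    int positionCount = 0;
    while (!pageSource.isFinished()) {
        Page page = pageSource.getNextPage();
        if (page == null) {
            continue;
        }
        positionCount += page.getLoadedPage().getPositionCount();
    }
    return positionCount;
}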
Use of io.trino.plugin.hive.orc.OrcReaderConfig in project trino by trinodb.
From class TestRubixCaching, method setup.
@BeforeClass
public void setup()
        throws IOException
{
    cacheStoragePath = getStoragePath("/");
    config = new HdfsConfig();
    List<PropertyMetadata<?>> hiveSessionProperties = getHiveSessionProperties(
            new HiveConfig(),
            new RubixEnabledConfig().setCacheEnabled(true),
            new OrcReaderConfig())
            .getSessionProperties();
    context = new HdfsContext(TestingConnectorSession.builder()
            .setPropertyMetadata(hiveSessionProperties)
            .build());

    nonCachingFileSystem = getNonCachingFileSystem();
}
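The same property wiring can be built with caching left at its default (disabled), for comparison against the nonCachingFileSystem obtained above. A sketch, assuming the getHiveSessionProperties overload and HdfsContext constructor shown in setup; the method name nonCachingContext is hypothetical:

// Hypothetical sketch: builds an HdfsContext with Rubix caching left disabled,
// using the same session property metadata as the cached setup above.
private static HdfsContext nonCachingContext()
{
    List<PropertyMetadata<?>> properties = getHiveSessionProperties(
            new HiveConfig(),
            new RubixEnabledConfig(),
            new OrcReaderConfig())
            .getSessionProperties();
    return new HdfsContext(TestingConnectorSession.builder()
            .setPropertyMetadata(properties)
            .build());
}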
Use of io.trino.plugin.hive.orc.OrcReaderConfig in project trino by trinodb.
From class ParquetTester, method assertMaxReadBytes.
void assertMaxReadBytes(
        List<ObjectInspector> objectInspectors,
        Iterable<?>[] writeValues,
        Iterable<?>[] readValues,
        List<String> columnNames,
        List<Type> columnTypes,
        Optional<MessageType> parquetSchema,
        DataSize maxReadBlockSize)
        throws Exception
{
    CompressionCodecName compressionCodecName = UNCOMPRESSED;
    HiveSessionProperties hiveSessionProperties = new HiveSessionProperties(
            new HiveConfig()
                    .setHiveStorageFormat(HiveStorageFormat.PARQUET)
                    .setUseParquetColumnNames(false),
            new OrcReaderConfig(),
            new OrcWriterConfig(),
            new ParquetReaderConfig().setMaxReadBlockSize(maxReadBlockSize),
            new ParquetWriterConfig());
    ConnectorSession session = TestingConnectorSession.builder()
            .setPropertyMetadata(hiveSessionProperties.getSessionProperties())
            .build();

    try (TempFile tempFile = new TempFile("test", "parquet")) {
        JobConf jobConf = new JobConf();
        jobConf.setEnum(COMPRESSION, compressionCodecName);
        jobConf.setBoolean(ENABLE_DICTIONARY, true);
        jobConf.setEnum(WRITER_VERSION, PARQUET_1_0);
        writeParquetColumn(
                jobConf,
                tempFile.getFile(),
                compressionCodecName,
                createTableProperties(columnNames, objectInspectors),
                getStandardStructObjectInspector(columnNames, objectInspectors),
                getIterators(writeValues),
                parquetSchema,
                false);

        Iterator<?>[] expectedValues = getIterators(readValues);
        try (ConnectorPageSource pageSource = fileFormat.createFileFormatReader(session, HDFS_ENVIRONMENT, tempFile.getFile(), columnNames, columnTypes)) {
            assertPageSource(columnTypes, expectedValues, pageSource, Optional.of(getParquetMaxReadBlockSize(session).toBytes()));
            assertFalse(stream(expectedValues).allMatch(Iterator::hasNext));
        }
    }
}
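Only the max read block size varies here, so the session construction can be wrapped once. A sketch, assuming the config setters shown above; parquetSessionWithMaxReadBlockSize is a hypothetical name:

// Hypothetical helper sketch: builds the Parquet test session for a given
// max read block size, matching the construction at the top of assertMaxReadBytes.
private static ConnectorSession parquetSessionWithMaxReadBlockSize(DataSize maxReadBlockSize)
{
    HiveSessionProperties properties = new HiveSessionProperties(
            new HiveConfig()
                    .setHiveStorageFormat(HiveStorageFormat.PARQUET)
                    .setUseParquetColumnNames(false),
            new OrcReaderConfig(),
            new OrcWriterConfig(),
            new ParquetReaderConfig().setMaxReadBlockSize(maxReadBlockSize),
            new ParquetWriterConfig());
    return TestingConnectorSession.builder()
            .setPropertyMetadata(properties.getSessionProperties())
            .build();
}

The configured limit can then be read back through getParquetMaxReadBlockSize(session), as the assertion above does.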
Use of io.trino.plugin.hive.orc.OrcReaderConfig in project trino by trinodb.
From class TestHiveFileFormats, method testOrcUseColumnNameLowerCaseConversion.
@Test(dataProvider = "rowCount")
public void testOrcUseColumnNameLowerCaseConversion(int rowCount)
        throws Exception
{
    List<TestColumn> testColumnsUpperCase = TEST_COLUMNS.stream()
            .map(testColumn -> new TestColumn(
                    testColumn.getName().toUpperCase(Locale.ENGLISH),
                    testColumn.getObjectInspector(),
                    testColumn.getWriteValue(),
                    testColumn.getExpectedValue(),
                    testColumn.isPartitionKey()))
            .collect(toList());
    ConnectorSession session = getHiveSession(new HiveConfig(), new OrcReaderConfig().setUseColumnNames(true));

    assertThatFileFormat(ORC)
            .withWriteColumns(testColumnsUpperCase)
            .withRowsCount(rowCount)
            .withReadColumns(TEST_COLUMNS)
            .withSession(session)
            .isReadableByPageSource(new OrcPageSourceFactory(new OrcReaderOptions(), HDFS_ENVIRONMENT, STATS, UTC));
}
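The inline renaming of TEST_COLUMNS can be pulled into a reusable helper. A sketch, assuming the TestColumn accessors used above; withUpperCaseNames is a hypothetical name, not part of the Trino test suite:

// Hypothetical helper sketch: copies test columns with upper-cased names, as done
// inline above to produce the write schema for the column-name mapping test.
private static List<TestColumn> withUpperCaseNames(List<TestColumn> columns)
{
    return columns.stream()
            .map(column -> new TestColumn(
                    column.getName().toUpperCase(Locale.ENGLISH),
                    column.getObjectInspector(),
                    column.getWriteValue(),
                    column.getExpectedValue(),
                    column.isPartitionKey()))
            .collect(toList());
}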