Example usage of io.trino.spi.type.DoubleType.DOUBLE in the Trino project (trinodb).
From the class MetastoreHiveStatisticsProvider, method calculateNullsFraction:
@VisibleForTesting
static Estimate calculateNullsFraction(String column, Collection<PartitionStatistics> partitionStatistics) {
    // Keep only partitions where both the row count and this column's null count are known;
    // any other partition contributes nothing to the estimate.
    List<PartitionStatistics> usable = partitionStatistics.stream()
            .filter(stats -> stats.getBasicStatistics().getRowCount().isPresent())
            .filter(stats -> {
                HiveColumnStatistics columnStats = stats.getColumnStatistics().get(column);
                return columnStats != null && columnStats.getNullsCount().isPresent();
            })
            .collect(toImmutableList());
    if (usable.isEmpty()) {
        return Estimate.unknown();
    }
    // Aggregate null and row counts across the usable partitions, re-verifying the
    // invariants established by the filter above.
    long nullsSum = 0;
    long rowsSum = 0;
    for (PartitionStatistics stats : usable) {
        long rows = stats.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(rows >= 0, "rowCount must be greater than or equal to zero");
        HiveColumnStatistics columnStats = stats.getColumnStatistics().get(column);
        verifyNotNull(columnStats, "columnStatistics is null");
        long nulls = columnStats.getNullsCount().orElseThrow(() -> new VerifyException("nullsCount is not present"));
        verify(nulls >= 0, "nullsCount must be greater than or equal to zero");
        verify(nulls <= rows, "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.", nulls, rows);
        nullsSum += nulls;
        rowsSum += rows;
    }
    // All usable partitions are empty: there are no rows, hence no nulls.
    if (rowsSum == 0) {
        return Estimate.zero();
    }
    verify(nullsSum <= rowsSum, "totalNullsCount must be less than or equal to totalRowCount. totalNullsCount: %s. totalRowCount: %s.", nullsSum, rowsSum);
    return Estimate.of(((double) nullsSum) / rowsSum);
}
Example usage of io.trino.spi.type.DoubleType.DOUBLE in the Trino project (trinodb).
From the class MetastoreHiveStatisticsProvider, method calculateRangeForPartitioningKey:
@VisibleForTesting
static Optional<DoubleRange> calculateRangeForPartitioningKey(HiveColumnHandle column, Type type, List<HivePartition> partitions) {
    // Convert each non-null partition key value for this column to a double, where possible.
    List<OptionalDouble> convertedValues = partitions.stream()
            .map(HivePartition::getKeys)
            .map(keys -> keys.get(column))
            .filter(value -> !value.isNull())
            .map(NullableValue::getValue)
            .map(value -> convertPartitionValueToDouble(type, value))
            .collect(toImmutableList());
    // If there are no usable values, or at least one value could not be converted,
    // the range cannot be bounded reliably, so report it as unknown.
    // (Previously a mix of convertible and unconvertible values tripped a
    // checkState with an IllegalStateException instead of degrading gracefully.)
    if (convertedValues.isEmpty() || !convertedValues.stream().allMatch(OptionalDouble::isPresent)) {
        return Optional.empty();
    }
    double[] values = convertedValues.stream()
            .mapToDouble(OptionalDouble::getAsDouble)
            .toArray();
    // NaN cannot participate in a meaningful min/max range.
    if (DoubleStream.of(values).anyMatch(Double::isNaN)) {
        return Optional.empty();
    }
    double min = DoubleStream.of(values).min().orElseThrow();
    double max = DoubleStream.of(values).max().orElseThrow();
    return Optional.of(new DoubleRange(min, max));
}
Example usage of io.trino.spi.type.DoubleType.DOUBLE in the Trino project (trinodb).
From the class TestHivePageSink, method writeTestFile:
// Writes generated TPC-H LineItem rows through a Hive page sink into outputPath,
// then reads the single produced file back and asserts the round-tripped data
// matches what was written. Returns the output file's length in bytes.
private static long writeTestFile(HiveConfig config, HiveMetastore metastore, String outputPath) {
HiveTransactionHandle transaction = new HiveTransactionHandle(false);
HiveWriterStats stats = new HiveWriterStats();
ConnectorPageSink pageSink = createPageSink(transaction, config, metastore, new Path("file:///" + outputPath), stats);
List<LineItemColumn> columns = getTestColumns();
// Resolve each TPC-H column's Hive type to its Trino type for the page builder.
List<Type> columnTypes = columns.stream().map(LineItemColumn::getType).map(TestHivePageSink::getHiveType).map(hiveType -> hiveType.getType(TESTING_TYPE_MANAGER)).collect(toList());
PageBuilder pageBuilder = new PageBuilder(columnTypes);
int rows = 0;
for (LineItem lineItem : new LineItemGenerator(0.01, 1, 1)) {
rows++;
// NOTE(review): the break fires before the current row is written, so only
// NUM_ROWS - 1 rows end up in the page — confirm this off-by-one is intended.
if (rows >= NUM_ROWS) {
break;
}
pageBuilder.declarePosition();
// Append each column's generated value using the writer for its base type.
for (int i = 0; i < columns.size(); i++) {
LineItemColumn column = columns.get(i);
BlockBuilder blockBuilder = pageBuilder.getBlockBuilder(i);
switch(column.getType().getBase()) {
case IDENTIFIER:
BIGINT.writeLong(blockBuilder, column.getIdentifier(lineItem));
break;
case INTEGER:
INTEGER.writeLong(blockBuilder, column.getInteger(lineItem));
break;
case DATE:
DATE.writeLong(blockBuilder, column.getDate(lineItem));
break;
case DOUBLE:
DOUBLE.writeDouble(blockBuilder, column.getDouble(lineItem));
break;
case VARCHAR:
createUnboundedVarcharType().writeSlice(blockBuilder, Slices.utf8Slice(column.getString(lineItem)));
break;
default:
throw new IllegalArgumentException("Unsupported type " + column.getType());
}
}
}
// Everything is written as one page; finish() flushes the sink to disk.
Page page = pageBuilder.build();
pageSink.appendPage(page);
getFutureValue(pageSink.finish());
// The sink is expected to produce exactly one data file (CRC sidecar files excluded).
File outputDir = new File(outputPath);
List<File> files = ImmutableList.copyOf(outputDir.listFiles((dir, name) -> !name.endsWith(".crc")));
File outputFile = getOnlyElement(files);
long length = outputFile.length();
// Read the file back and collect all pages for comparison.
ConnectorPageSource pageSource = createPageSource(transaction, config, outputFile);
List<Page> pages = new ArrayList<>();
while (!pageSource.isFinished()) {
Page nextPage = pageSource.getNextPage();
if (nextPage != null) {
pages.add(nextPage.getLoadedPage());
}
}
// Round-trip check: what was read must equal what was written.
MaterializedResult expectedResults = toMaterializedResult(getHiveSession(config), columnTypes, ImmutableList.of(page));
MaterializedResult results = toMaterializedResult(getHiveSession(config), columnTypes, pages);
assertEquals(results, expectedResults);
// The single appended page should account for the max recorded input page size.
assertEquals(round(stats.getInputPageSizeInBytes().getAllTime().getMax()), page.getRetainedSizeInBytes());
return length;
}
Example usage of io.trino.spi.type.DoubleType.DOUBLE in the Trino project (trinodb).
From the class BaseIcebergConnectorTest, method testSplitPruningFromDataFileStatistics:
@Test(dataProvider = "testDataMappingSmokeTestDataProvider")
public void testSplitPruningFromDataFileStatistics(DataMappingTestSetup testSetup) {
    if (testSetup.isUnsupportedType()) {
        return;
    }
    // The random double column keeps rows distinct; otherwise compression may
    // deduplicate rows, resulting in only one row group.
    try (TestTable table = new TestTable(
            getQueryRunner()::execute,
            "test_split_pruning_data_file_statistics",
            "(col " + testSetup.getTrinoTypeName() + ", r double)")) {
        String target = table.getName();
        // 100 copies of the sample value followed by 100 copies of the high value,
        // each paired with a random double.
        String rowLiterals = Stream
                .concat(
                        nCopies(100, testSetup.getSampleValueLiteral()).stream(),
                        nCopies(100, testSetup.getHighValueLiteral()).stream())
                .map(literal -> "(" + literal + ", rand())")
                .collect(Collectors.joining(", "));
        assertUpdate(withSmallRowGroups(getSession()), "INSERT INTO " + target + " VALUES " + rowLiterals, 200);
        String probeQuery = "SELECT * FROM " + target + " WHERE col = " + testSetup.getSampleValueLiteral();
        verifyPredicatePushdownDataRead(probeQuery, supportsRowGroupStatistics(testSetup.getTrinoTypeName()));
    }
}
Example usage of io.trino.spi.type.DoubleType.DOUBLE in the Trino project (trinodb).
From the class BaseIcebergConnectorTest, method testMultipleColumnTableStatistics:
@Test
public void testMultipleColumnTableStatistics() {
    // Verifies SHOW STATS output evolves correctly as rows (including NULLs) accumulate.
    String table = "test_multiple_table_statistics";
    assertUpdate(format("CREATE TABLE %s (col1 REAL, col2 INTEGER, col3 DATE)", table));
    String insertStart = format("INSERT INTO %s", table);
    assertUpdate(insertStart + " VALUES (-10, -1, DATE '2019-06-28')", 1);
    assertUpdate(insertStart + " VALUES (100, 10, DATE '2020-01-01')", 1);
    // Two rows: ranges span the inserted minimum and maximum per column.
    MaterializedResult stats = computeActual("SHOW STATS FOR " + table);
    MaterializedResult expected = resultBuilder(getSession(), VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR)
            .row("col1", null, null, 0.0, null, "-10.0", "100.0")
            .row("col2", null, null, 0.0, null, "-1", "10")
            .row("col3", null, null, 0.0, null, "2019-06-28", "2020-01-01")
            .row(null, null, null, null, 2.0, null, null)
            .build();
    assertEquals(stats, expected);
    // A third row extends the upper bounds.
    assertUpdate(insertStart + " VALUES (200, 20, DATE '2020-06-28')", 1);
    stats = computeActual("SHOW STATS FOR " + table);
    expected = resultBuilder(getSession(), VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR)
            .row("col1", null, null, 0.0, null, "-10.0", "200.0")
            .row("col2", null, null, 0.0, null, "-1", "20")
            .row("col3", null, null, 0.0, null, "2019-06-28", "2020-06-28")
            .row(null, null, null, null, 3.0, null, null)
            .build();
    assertEquals(stats, expected);
    // Five more non-null rows, then five rows with NULL col1: nulls fraction becomes 5/13.
    assertUpdate(insertStart + " VALUES " + IntStream.rangeClosed(21, 25).mapToObj(i -> format("(200, %d, DATE '2020-07-%d')", i, i)).collect(joining(", ")), 5);
    assertUpdate(insertStart + " VALUES " + IntStream.rangeClosed(26, 30).mapToObj(i -> format("(NULL, %d, DATE '2020-06-%d')", i, i)).collect(joining(", ")), 5);
    stats = computeActual("SHOW STATS FOR " + table);
    expected = resultBuilder(getSession(), VARCHAR, DOUBLE, DOUBLE, DOUBLE, DOUBLE, VARCHAR, VARCHAR)
            .row("col1", null, null, 5.0 / 13.0, null, "-10.0", "200.0")
            .row("col2", null, null, 0.0, null, "-1", "30")
            .row("col3", null, null, 0.0, null, "2019-06-28", "2020-07-25")
            .row(null, null, null, null, 13.0, null, null)
            .build();
    assertEquals(stats, expected);
    dropTable(table);
}
Aggregations