Use of org.apache.hudi.exception.HoodieException in project hudi by apache.
The class HoodieSparkBootstrapSchemaProvider, method getBootstrapSourceSchemaOrc.
private static Schema getBootstrapSourceSchemaOrc(HoodieWriteConfig writeConfig, HoodieEngineContext context, Path filePath) {
  Reader orcReader = null;
  try {
    orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(context.getHadoopConf().get()));
  } catch (IOException e) {
    throw new HoodieException("Could not determine schema from the data files.");
  }
  // Read the ORC footer schema and convert it to an Avro record schema for the table
  TypeDescription orcSchema = orcReader.getSchema();
  String tableName = HoodieAvroUtils.sanitizeName(writeConfig.getTableName());
  String structName = tableName + "_record";
  String recordNamespace = "hoodie." + tableName;
  return AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, structName, recordNamespace, true);
}
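For context, here is a minimal, self-contained sketch exercising the same ORC-to-Avro conversion path directly. The file path, record name, and namespace are hypothetical, and the import locations reflect the usual hudi-common / ORC packages but may differ by version:

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.AvroOrcUtils;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;

public class OrcSchemaProbe {
  public static void main(String[] args) throws Exception {
    // Hypothetical bootstrap source file; replace with a real ORC path.
    Path filePath = new Path("/tmp/bootstrap_source/partition=2022/part-0001.orc");
    Reader orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(new Configuration()));
    TypeDescription orcSchema = orcReader.getSchema();
    // Same conversion call the provider uses; the struct name and namespace here are illustrative only.
    Schema avroSchema = AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, "my_table_record", "hoodie.my_table", true);
    System.out.println(avroSchema.toString(true));
  }
}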
Use of org.apache.hudi.exception.HoodieException in project hudi by apache.
The class ColumnStatsIndexHelper, method buildColumnStatsTableFor.
/**
* Parses min/max statistics from Parquet footers for the provided columns and composes a column-stats
* index table in the following format, with 3 statistics recorded for each
* linear/Z-curve/Hilbert-curve-ordered column. For example, if the original table contained
* column {@code A}:
*
* <pre>
* +---------------------------+------------+------------+-------------+
* | file | A_minValue | A_maxValue | A_num_nulls |
* +---------------------------+------------+------------+-------------+
* | one_base_file.parquet | 1 | 10 | 0 |
* | another_base_file.parquet | -10 | 0 | 5 |
* +---------------------------+------------+------------+-------------+
* </pre>
*
* NOTE: Currently {@link TimestampType} is not supported, since Parquet writer
* does not support statistics for it.
*
* TODO leverage metadata table after RFC-27 lands
* @VisibleForTesting
*
* @param sparkSession encompassing Spark session
* @param baseFilesPaths list of base-file paths to be sourced for the column-stats index
* @param orderedColumnSchemas target ordered columns
* @return Spark's {@link Dataset} holding an index table
*/
@Nonnull
public static Dataset<Row> buildColumnStatsTableFor(@Nonnull SparkSession sparkSession, @Nonnull List<String> baseFilesPaths, @Nonnull List<StructField> orderedColumnSchemas) {
  SparkContext sc = sparkSession.sparkContext();
  JavaSparkContext jsc = new JavaSparkContext(sc);
  SerializableConfiguration serializableConfiguration = new SerializableConfiguration(sc.hadoopConfiguration());
  int numParallelism = (baseFilesPaths.size() / 3 + 1);
  List<HoodieColumnRangeMetadata<Comparable>> colMinMaxInfos;
  String previousJobDescription = sc.getLocalProperty(SPARK_JOB_DESCRIPTION);
  try {
    jsc.setJobDescription("Listing parquet column statistics");
    // Read column ranges (min/max/null-count) from the Parquet footers of all base files
    colMinMaxInfos = jsc.parallelize(baseFilesPaths, numParallelism)
        .mapPartitions(paths -> {
          ParquetUtils utils = (ParquetUtils) BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
          Iterable<String> iterable = () -> paths;
          return StreamSupport.stream(iterable.spliterator(), false)
              .flatMap(path ->
                  utils.readRangeFromParquetMetadata(
                      serializableConfiguration.value(),
                      new Path(path),
                      orderedColumnSchemas.stream().map(StructField::name).collect(Collectors.toList())
                  ).stream())
              .iterator();
        })
        .collect();
  } finally {
    jsc.setJobDescription(previousJobDescription);
  }
  // Group the column metadata by the file path of the file it belongs to
  Map<String, List<HoodieColumnRangeMetadata<Comparable>>> filePathToColumnMetadataMap =
      colMinMaxInfos.stream().collect(Collectors.groupingBy(HoodieColumnRangeMetadata::getFilePath));
  JavaRDD<Row> allMetaDataRDD = jsc.parallelize(new ArrayList<>(filePathToColumnMetadataMap.values()), 1)
      .map(fileColumnsMetadata -> {
        int colSize = fileColumnsMetadata.size();
        if (colSize == 0) {
          return null;
        }
        String filePath = fileColumnsMetadata.get(0).getFilePath();
        List<Object> indexRow = new ArrayList<>();
        // The first column of the index row is the target file path
        indexRow.add(filePath);
        // For each target column, append its min value, max value and null count
        orderedColumnSchemas.forEach(colSchema -> {
          String colName = colSchema.name();
          HoodieColumnRangeMetadata<Comparable> colMetadata = fileColumnsMetadata.stream()
              .filter(s -> s.getColumnName().trim().equalsIgnoreCase(colName))
              .findFirst()
              .orElse(null);
          DataType colType = colSchema.dataType();
          if (colMetadata == null || colType == null) {
            throw new HoodieException(String.format("Cannot collect min/max statistics for column (%s)", colSchema));
          }
          Pair<Object, Object> minMaxValue = fetchMinMaxValues(colType, colMetadata);
          indexRow.add(minMaxValue.getLeft());   // min
          indexRow.add(minMaxValue.getRight());  // max
          indexRow.add(colMetadata.getNullCount());
        });
        return Row$.MODULE$.apply(JavaConversions.asScalaBuffer(indexRow));
      })
      .filter(Objects::nonNull);
  StructType indexSchema = composeIndexSchema(orderedColumnSchemas);
  return sparkSession.createDataFrame(allMetaDataRDD, indexSchema);
}
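For orientation, a minimal sketch of invoking this helper. The SparkSession settings, base-file paths, and the single ordered column `A` are all hypothetical, and the import for ColumnStatsIndexHelper is omitted since its package depends on the Hudi version:

import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;

public class ColumnStatsIndexProbe {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("column-stats-index-probe")
        .master("local[2]")
        .getOrCreate();
    // Hypothetical base-file paths of an existing table.
    List<String> baseFilePaths = Arrays.asList(
        "/tmp/hudi_table/one_base_file.parquet",
        "/tmp/hudi_table/another_base_file.parquet");
    // The ordered (e.g. Z-order / Hilbert-order) columns to collect statistics for.
    List<StructField> orderedColumns = Arrays.asList(
        new StructField("A", DataTypes.IntegerType, true, Metadata.empty()));
    Dataset<Row> indexTable =
        ColumnStatsIndexHelper.buildColumnStatsTableFor(spark, baseFilePaths, orderedColumns);
    // Expected columns per the Javadoc above: file, A_minValue, A_maxValue, A_num_nulls.
    indexTable.show(false);
    spark.stop();
  }
}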
Use of org.apache.hudi.exception.HoodieException in project hudi by apache.
The class TestIncrementalFSViewSync, method testRestore.
private void testRestore(SyncableFileSystemView view, List<String> newRestoreInstants, Map<String, List<String>> instantsToFiles,
    List<HoodieInstant> rolledBackInstants, String emptyRestoreInstant, boolean isRestore,
    int totalReplacedFileSlicesPerPartition, int totalFilesAddedPerPartitionPerInstant) {
  assertEquals(newRestoreInstants.size(), rolledBackInstants.size());
  long initialFileSlices = partitions.stream().mapToLong(p -> view.getAllFileSlices(p).count()).findAny().getAsLong();
  final int numFileSlicesAddedPerInstant = (totalFilesAddedPerPartitionPerInstant - totalReplacedFileSlicesPerPartition);
  final long expectedLatestFileSlices = fileIdsPerPartition.size() + (rolledBackInstants.size()) * numFileSlicesAddedPerInstant;
  IntStream.range(0, newRestoreInstants.size()).forEach(idx -> {
    HoodieInstant instant = rolledBackInstants.get(idx);
    try {
      boolean isDeltaCommit = HoodieTimeline.DELTA_COMMIT_ACTION.equalsIgnoreCase(instant.getAction());
      performRestore(instant, instantsToFiles.get(instant.getTimestamp()), newRestoreInstants.get(idx), isRestore);
      final long expTotalFileSlicesPerPartition = isDeltaCommit
          ? initialFileSlices
          : initialFileSlices - ((idx + 1) * (fileIdsPerPartition.size() - totalReplacedFileSlicesPerPartition));
      view.sync();
      assertTrue(view.getLastInstant().isPresent());
      LOG.info("Last Instant is :" + view.getLastInstant().get());
      if (isRestore) {
        assertEquals(newRestoreInstants.get(idx), view.getLastInstant().get().getTimestamp());
        assertEquals(HoodieTimeline.RESTORE_ACTION, view.getLastInstant().get().getAction());
      }
      assertEquals(State.COMPLETED, view.getLastInstant().get().getState());
      if (HoodieTimeline.compareTimestamps(newRestoreInstants.get(idx), HoodieTimeline.GREATER_THAN_OR_EQUALS, emptyRestoreInstant)) {
        // Once the restore instant reaches the designated empty-restore instant, no latest file slices remain
        partitions.forEach(p -> assertEquals(0, view.getLatestFileSlices(p).count()));
      } else {
        partitions.forEach(p -> assertEquals(expectedLatestFileSlices - (idx + 1) * numFileSlicesAddedPerInstant, view.getLatestFileSlices(p).count()));
      }
      partitions.forEach(p -> assertEquals(expTotalFileSlicesPerPartition, view.getAllFileSlices(p).count()));
      // A freshly built view must agree with the incrementally synced one
      metaClient.reloadActiveTimeline();
      SyncableFileSystemView newView = getFileSystemView(metaClient);
      areViewsConsistent(view, newView, expTotalFileSlicesPerPartition * partitions.size());
    } catch (IOException e) {
      throw new HoodieException(e);
    }
  });
}
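To make the bookkeeping above concrete, a small worked example of the expected-count arithmetic with hypothetical test parameters (3 file IDs per partition, 1 replaced file slice per partition, 3 files added per partition per instant, 2 rolled-back instants), mirroring the variables in testRestore:

public class RestoreCountProbe {
  public static void main(String[] args) {
    // Hypothetical test parameters.
    int fileIdsPerPartition = 3;
    int totalReplacedFileSlicesPerPartition = 1;
    int totalFilesAddedPerPartitionPerInstant = 3;
    int rolledBackInstants = 2;

    int numFileSlicesAddedPerInstant = totalFilesAddedPerPartitionPerInstant - totalReplacedFileSlicesPerPartition; // 2
    long expectedLatestFileSlices = fileIdsPerPartition + (long) rolledBackInstants * numFileSlicesAddedPerInstant; // 3 + 2 * 2 = 7

    // After restoring the first instant (idx = 0), each partition is expected to expose
    // expectedLatestFileSlices - (idx + 1) * numFileSlicesAddedPerInstant latest file slices.
    int idx = 0;
    System.out.println(expectedLatestFileSlices - (idx + 1) * numFileSlicesAddedPerInstant); // 5
  }
}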
Use of org.apache.hudi.exception.HoodieException in project hudi by apache.
The class TestIncrementalFSViewSync, method generateDataForInstant.
private List<Pair<String, HoodieWriteStat>> generateDataForInstant(String baseInstant, String instant, boolean deltaCommit, List<String> fileIds) {
  return partitions.stream().flatMap(p -> fileIds.stream().map(f -> {
    try {
      // Create an empty log file (delta commit) or base file (regular commit) for this file group
      File file = new File(basePath + "/" + p + "/"
          + (deltaCommit
              ? FSUtils.makeLogFileName(f, ".log", baseInstant, Integer.parseInt(instant), TEST_WRITE_TOKEN)
              : FSUtils.makeDataFileName(instant, TEST_WRITE_TOKEN, f)));
      file.createNewFile();
      HoodieWriteStat w = new HoodieWriteStat();
      w.setFileId(f);
      w.setPath(String.format("%s/%s", p, file.getName()));
      return Pair.of(p, w);
    } catch (IOException e) {
      throw new HoodieException(e);
    }
  })).collect(Collectors.toList());
}
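A minimal, self-contained sketch of the same file-naming helpers this generator relies on. The file ID, instants, write token, and partition name are hypothetical; only the FSUtils and HoodieWriteStat calls already used above are exercised:

import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieWriteStat;

public class FileNameProbe {
  public static void main(String[] args) {
    String fileId = "file-0001";            // hypothetical file group id
    String baseInstant = "20220101010101";  // hypothetical base commit instant
    String writeToken = "1-0-1";            // hypothetical write token

    // Base-file name used for a regular (non-delta) commit.
    String dataFileName = FSUtils.makeDataFileName("20220101020202", writeToken, fileId);
    // Log-file name used for a delta commit, versioned against the base instant.
    String logFileName = FSUtils.makeLogFileName(fileId, ".log", baseInstant, 2, writeToken);

    HoodieWriteStat stat = new HoodieWriteStat();
    stat.setFileId(fileId);
    stat.setPath("partition_a/" + dataFileName);

    System.out.println(dataFileName);
    System.out.println(logFileName);
    System.out.println(stat.getPath());
  }
}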
Use of org.apache.hudi.exception.HoodieException in project hudi by apache.
The class TestHoodieActiveTimeline, method testCreateNewInstantTime.
@Test
public void testCreateNewInstantTime() throws Exception {
  String lastInstantTime = HoodieActiveTimeline.createNewInstantTime();
  for (int i = 0; i < 3; ++i) {
    String newInstantTime = HoodieActiveTimeline.createNewInstantTime();
    assertTrue(HoodieTimeline.compareTimestamps(lastInstantTime, HoodieTimeline.LESSER_THAN, newInstantTime));
    lastInstantTime = newInstantTime;
  }
  // All-zero timestamp can be parsed
  HoodieActiveTimeline.parseDateFromInstantTime("00000000000000");
  // Multi-threaded test: formatDate must return a stable result for the same Date
  final int numChecks = 100000;
  final int numThreads = 100;
  final long milliSecondsInYear = 365L * 24 * 3600 * 1000; // long literal to avoid int overflow
  ExecutorService executorService = Executors.newFixedThreadPool(numThreads);
  List<Future> futures = new ArrayList<>(numThreads);
  for (int idx = 0; idx < numThreads; ++idx) {
    futures.add(executorService.submit(() -> {
      Date date = new Date(System.currentTimeMillis() + (int) (Math.random() * numThreads) * milliSecondsInYear);
      final String expectedFormat = HoodieActiveTimeline.formatDate(date);
      for (int tidx = 0; tidx < numChecks; ++tidx) {
        final String curFormat = HoodieActiveTimeline.formatDate(date);
        if (!curFormat.equals(expectedFormat)) {
          throw new HoodieException("Format error: expected=" + expectedFormat + ", curFormat=" + curFormat);
        }
      }
    }));
  }
  executorService.shutdown();
  assertTrue(executorService.awaitTermination(10, TimeUnit.SECONDS));
  // Get each future so any exception thrown inside a worker is re-raised here
  for (Future f : futures) {
    f.get();
  }
}
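A minimal sketch of the instant-time helpers this test exercises, covering only the single-threaded happy path; it uses just the HoodieActiveTimeline and HoodieTimeline calls that appear in the test above:

import java.util.Date;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;

public class InstantTimeProbe {
  public static void main(String[] args) throws Exception {
    // Instant times are monotonically increasing timestamp strings.
    String first = HoodieActiveTimeline.createNewInstantTime();
    String second = HoodieActiveTimeline.createNewInstantTime();
    System.out.println(first + " < " + second + " ? "
        + HoodieTimeline.compareTimestamps(first, HoodieTimeline.LESSER_THAN, second));

    // An instant time can be parsed back into a Date ...
    Date parsed = HoodieActiveTimeline.parseDateFromInstantTime(first);
    // ... and a Date can be formatted back into an instant time.
    System.out.println(HoodieActiveTimeline.formatDate(parsed));
  }
}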