Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class SimpleWorkflowDagGenerator, method build.
@Override
public WorkflowDag build() {
  DagNode root = new InsertNode(DeltaConfig.Config.newBuilder()
      .withNumRecordsToInsert(100).withNumInsertPartitions(1).withNumTimesToRepeat(2).withRecordSize(1000).build());
  DagNode child1 = new InsertNode(DeltaConfig.Config.newBuilder()
      .withNumRecordsToInsert(100).withNumInsertPartitions(1).withNumTimesToRepeat(2).withRecordSize(1000).build());
  root.addChildNode(child1);
  DagNode child1OfChild1 = new UpsertNode(DeltaConfig.Config.newBuilder()
      .withNumRecordsToUpdate(100).withNumUpsertPartitions(2).withNumTimesToRepeat(1).withRecordSize(1000).build());
  // Tests running 2 nodes in parallel
  child1.addChildNode(child1OfChild1);
  List<Pair<String, Integer>> queryAndResult = new ArrayList<>();
  queryAndResult.add(Pair.of("select count(*) from testdb1.table1 group by rider having count(*) < 1", 0));
  DagNode child2OfChild1 = new HiveQueryNode(DeltaConfig.Config.newBuilder()
      .withHiveQueryAndResults(queryAndResult).withHiveLocal(true).build());
  child1.addChildNode(child2OfChild1);
  List<DagNode> rootNodes = new ArrayList<>();
  rootNodes.add(root);
  return new WorkflowDag(rootNodes);
}
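In this DAG builder, Pair is used only to bundle a Hive validation query with the row count it is expected to return. The standalone sketch below (assuming just org.apache.hudi.common.util.collection.Pair and standard Java; the buildValidationQueries helper is hypothetical) shows how such a query/expected-result list can be assembled and read back:

import java.util.ArrayList;
import java.util.List;

import org.apache.hudi.common.util.collection.Pair;

public class QueryResultPairExample {

  // Hypothetical helper mirroring the list passed to withHiveQueryAndResults(...).
  static List<Pair<String, Integer>> buildValidationQueries() {
    List<Pair<String, Integer>> queryAndResult = new ArrayList<>();
    // The query text is the left element, the expected result (0 matching groups) the right one.
    queryAndResult.add(Pair.of("select count(*) from testdb1.table1 group by rider having count(*) < 1", 0));
    return queryAndResult;
  }

  public static void main(String[] args) {
    for (Pair<String, Integer> queryAndExpected : buildValidationQueries()) {
      System.out.println("query=" + queryAndExpected.getLeft() + ", expected=" + queryAndExpected.getRight());
    }
  }
}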
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class CompactionCommand, method printAllCompactions.
/**
 * Prints all compaction details.
 */
private String printAllCompactions(HoodieDefaultTimeline timeline, Function<HoodieInstant, HoodieCompactionPlan> compactionPlanReader,
    boolean includeExtraMetadata, String sortByField, boolean descending, int limit, boolean headerOnly) {
  Stream<HoodieInstant> instantsStream = timeline.getWriteTimeline().getReverseOrderedInstants();
  List<Pair<HoodieInstant, HoodieCompactionPlan>> compactionPlans = instantsStream
      .map(instant -> Pair.of(instant, compactionPlanReader.apply(instant)))
      .filter(pair -> pair.getRight() != null)
      .collect(Collectors.toList());
  Set<String> committedInstants = timeline.getCommitTimeline().filterCompletedInstants()
      .getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toSet());
  List<Comparable[]> rows = new ArrayList<>();
  for (Pair<HoodieInstant, HoodieCompactionPlan> compactionPlan : compactionPlans) {
    HoodieCompactionPlan plan = compactionPlan.getRight();
    HoodieInstant instant = compactionPlan.getLeft();
    final HoodieInstant.State state;
    if (committedInstants.contains(instant.getTimestamp())) {
      state = HoodieInstant.State.COMPLETED;
    } else {
      state = instant.getState();
    }
    if (includeExtraMetadata) {
      rows.add(new Comparable[] {instant.getTimestamp(), state.toString(),
          plan.getOperations() == null ? 0 : plan.getOperations().size(), plan.getExtraMetadata().toString()});
    } else {
      rows.add(new Comparable[] {instant.getTimestamp(), state.toString(),
          plan.getOperations() == null ? 0 : plan.getOperations().size()});
    }
  }
  Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
  TableHeader header = new TableHeader()
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_COMPACTION_INSTANT_TIME)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_STATE)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_TO_BE_COMPACTED);
  if (includeExtraMetadata) {
    header = header.addTableHeaderField(HoodieTableHeaderFields.HEADER_EXTRA_METADATA);
  }
  return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
}
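The essential Pair move in this command is zipping every instant with its (possibly missing) compaction plan and dropping the pairs whose right side is null. A minimal, self-contained sketch of that map/filter shape, with plain strings standing in for HoodieInstant and HoodieCompactionPlan (none of the names below are Hudi APIs):

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

public class PairZipAndFilterExample {

  public static void main(String[] args) {
    // Stand-ins for timeline instants and a plan reader that returns null when no plan exists.
    List<String> instants = Arrays.asList("001", "002", "003");
    Map<String, String> plansByInstant = new HashMap<>();
    plansByInstant.put("001", "plan-A");
    plansByInstant.put("003", "plan-C");
    Function<String, String> planReader = plansByInstant::get;

    // Same shape as printAllCompactions: pair each instant with its plan, keep only readable plans.
    List<Pair<String, String>> instantAndPlan = instants.stream()
        .map(instant -> Pair.of(instant, planReader.apply(instant)))
        .filter(pair -> pair.getRight() != null)
        .collect(Collectors.toList());

    // Prints "001 -> plan-A" and "003 -> plan-C"; instant 002 is dropped because no plan was found.
    instantAndPlan.forEach(pair -> System.out.println(pair.getLeft() + " -> " + pair.getRight()));
  }
}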
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class TestInLineFileSystem, method testReadInlineFile.
@Test
public void testReadInlineFile() throws IOException {
  Path outerPath = getRandomOuterFSPath();
  listOfGeneratedPaths.add(outerPath);
  // embed n slices so that we can test N inline seqPaths
  int totalSlices = 5;
  List<Pair<Long, Integer>> startOffsetLengthPairs = new ArrayList<>();
  List<byte[]> expectedByteArrays = new ArrayList<>();
  FSDataOutputStream wrappedOut = outerPath.getFileSystem(conf).create(outerPath, true);
  for (int i = 0; i < totalSlices; i++) {
    // append random bytes
    byte[] randomBytes = new byte[RANDOM.nextInt(1000)];
    RANDOM.nextBytes(randomBytes);
    wrappedOut.write(randomBytes);
    long startOffset = wrappedOut.getPos();
    // add inline content
    byte[] embeddedInlineBytes = new byte[RANDOM.nextInt(1000)];
    RANDOM.nextBytes(embeddedInlineBytes);
    wrappedOut.write(embeddedInlineBytes);
    expectedByteArrays.add(embeddedInlineBytes);
    startOffsetLengthPairs.add(Pair.of(startOffset, embeddedInlineBytes.length));
  }
  // suffix random bytes
  byte[] randomBytes = new byte[RANDOM.nextInt(1000)];
  RANDOM.nextBytes(randomBytes);
  wrappedOut.write(randomBytes);
  wrappedOut.flush();
  wrappedOut.close();
  FileStatus expectedFileStatus = outerPath.getFileSystem(conf).getFileStatus(outerPath);
  for (int i = 0; i < totalSlices; i++) {
    Pair<Long, Integer> startOffsetLengthPair = startOffsetLengthPairs.get(i);
    byte[] expectedBytes = expectedByteArrays.get(i);
    Path inlinePath = FileSystemTestUtils.getPhantomFile(outerPath, startOffsetLengthPair.getLeft(), startOffsetLengthPair.getRight());
    InLineFileSystem inlineFileSystem = (InLineFileSystem) inlinePath.getFileSystem(conf);
    FSDataInputStream fsDataInputStream = inlineFileSystem.open(inlinePath);
    assertTrue(inlineFileSystem.exists(inlinePath));
    verifyFileStatus(expectedFileStatus, inlinePath, startOffsetLengthPair.getRight(), inlineFileSystem.getFileStatus(inlinePath));
    FileStatus[] actualFileStatuses = inlineFileSystem.listStatus(inlinePath);
    assertEquals(1, actualFileStatuses.length);
    verifyFileStatus(expectedFileStatus, inlinePath, startOffsetLengthPair.getRight(), actualFileStatuses[0]);
    byte[] actualBytes = new byte[expectedBytes.length];
    fsDataInputStream.readFully(0, actualBytes);
    assertArrayEquals(expectedBytes, actualBytes);
    fsDataInputStream.close();
    assertEquals(InLineFileSystem.SCHEME, inlineFileSystem.getScheme());
    assertEquals(URI.create(InLineFileSystem.SCHEME), inlineFileSystem.getUri());
  }
}
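Here Pair<Long, Integer> is pure bookkeeping: it remembers where each embedded slice starts and how many bytes it spans, so the test can later cut each slice back out and compare it. The sketch below reproduces that bookkeeping against an in-memory byte buffer, using only standard Java plus Pair (no Hudi file-system classes involved):

import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.apache.hudi.common.util.collection.Pair;

public class OffsetLengthPairExample {

  public static void main(String[] args) {
    Random random = new Random();
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    List<Pair<Integer, Integer>> startOffsetLengthPairs = new ArrayList<>();

    // Write three slices, remembering (startOffset, length) for each one as a Pair.
    for (int i = 0; i < 3; i++) {
      byte[] slice = new byte[random.nextInt(100) + 1];
      random.nextBytes(slice);
      startOffsetLengthPairs.add(Pair.of(out.size(), slice.length));
      out.write(slice, 0, slice.length);
    }

    // Later, each pair is enough to cut its slice back out of the full buffer.
    byte[] all = out.toByteArray();
    for (Pair<Integer, Integer> offsetAndLength : startOffsetLengthPairs) {
      int start = offsetAndLength.getLeft();
      int length = offsetAndLength.getRight();
      byte[] slice = Arrays.copyOfRange(all, start, start + length);
      System.out.println("slice at offset " + start + " has " + slice.length + " bytes");
    }
  }
}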
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class HoodieTableMetadataUtil, method convertFilesToBloomFilterRecords.
/**
 * Convert added and deleted files metadata to bloom filter index records.
 */
public static HoodieData<HoodieRecord> convertFilesToBloomFilterRecords(HoodieEngineContext engineContext,
    Map<String, List<String>> partitionToDeletedFiles, Map<String, Map<String, Long>> partitionToAppendedFiles,
    MetadataRecordsGenerationParams recordsGenerationParams, String instantTime) {
  HoodieData<HoodieRecord> allRecordsRDD = engineContext.emptyHoodieData();
  List<Pair<String, List<String>>> partitionToDeletedFilesList = partitionToDeletedFiles.entrySet().stream()
      .map(e -> Pair.of(e.getKey(), e.getValue())).collect(Collectors.toList());
  int parallelism = Math.max(Math.min(partitionToDeletedFilesList.size(), recordsGenerationParams.getBloomIndexParallelism()), 1);
  HoodieData<Pair<String, List<String>>> partitionToDeletedFilesRDD = engineContext.parallelize(partitionToDeletedFilesList, parallelism);
  HoodieData<HoodieRecord> deletedFilesRecordsRDD = partitionToDeletedFilesRDD.flatMap(partitionToDeletedFilesPair -> {
    final String partitionName = partitionToDeletedFilesPair.getLeft();
    final List<String> deletedFileList = partitionToDeletedFilesPair.getRight();
    return deletedFileList.stream().flatMap(deletedFile -> {
      if (!FSUtils.isBaseFile(new Path(deletedFile))) {
        return Stream.empty();
      }
      final String partition = getPartition(partitionName);
      return Stream.<HoodieRecord>of(HoodieMetadataPayload.createBloomFilterMetadataRecord(
          partition, deletedFile, instantTime, StringUtils.EMPTY_STRING, ByteBuffer.allocate(0), true));
    }).iterator();
  });
  allRecordsRDD = allRecordsRDD.union(deletedFilesRecordsRDD);
  List<Pair<String, Map<String, Long>>> partitionToAppendedFilesList = partitionToAppendedFiles.entrySet().stream()
      .map(entry -> Pair.of(entry.getKey(), entry.getValue())).collect(Collectors.toList());
  parallelism = Math.max(Math.min(partitionToAppendedFilesList.size(), recordsGenerationParams.getBloomIndexParallelism()), 1);
  HoodieData<Pair<String, Map<String, Long>>> partitionToAppendedFilesRDD = engineContext.parallelize(partitionToAppendedFilesList, parallelism);
  HoodieData<HoodieRecord> appendedFilesRecordsRDD = partitionToAppendedFilesRDD.flatMap(partitionToAppendedFilesPair -> {
    final String partitionName = partitionToAppendedFilesPair.getLeft();
    final Map<String, Long> appendedFileMap = partitionToAppendedFilesPair.getRight();
    final String partition = getPartition(partitionName);
    return appendedFileMap.entrySet().stream().flatMap(appendedFileLengthPairEntry -> {
      final String appendedFile = appendedFileLengthPairEntry.getKey();
      if (!FSUtils.isBaseFile(new Path(appendedFile))) {
        return Stream.empty();
      }
      final String pathWithPartition = partitionName + "/" + appendedFile;
      final Path appendedFilePath = new Path(recordsGenerationParams.getDataMetaClient().getBasePath(), pathWithPartition);
      try (HoodieFileReader<IndexedRecord> fileReader =
          HoodieFileReaderFactory.getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), appendedFilePath)) {
        final BloomFilter fileBloomFilter = fileReader.readBloomFilter();
        if (fileBloomFilter == null) {
          LOG.error("Failed to read bloom filter for " + appendedFilePath);
          return Stream.empty();
        }
        ByteBuffer bloomByteBuffer = ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes());
        HoodieRecord record = HoodieMetadataPayload.createBloomFilterMetadataRecord(
            partition, appendedFile, instantTime, recordsGenerationParams.getBloomFilterType(), bloomByteBuffer, false);
        return Stream.of(record);
      } catch (IOException e) {
        LOG.error("Failed to get bloom filter for file: " + appendedFilePath);
      }
      return Stream.empty();
    }).iterator();
  });
  allRecordsRDD = allRecordsRDD.union(appendedFilesRecordsRDD);
  return allRecordsRDD;
}
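Both halves of this method follow the same preparation pattern: flatten the per-partition map into a List<Pair<partition, files>>, then clamp the parallelism to at least 1 and at most the configured limit before calling engineContext.parallelize. A minimal sketch of just that step, detached from any engine context (the partitions, file names, and the configured value 10 are made-up examples):

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

public class PartitionPairParallelismExample {

  public static void main(String[] args) {
    Map<String, List<String>> partitionToDeletedFiles = new HashMap<>();
    partitionToDeletedFiles.put("2022/01/01", Arrays.asList("f1.parquet", "f2.parquet"));
    partitionToDeletedFiles.put("2022/01/02", Arrays.asList("f3.parquet"));

    // Flatten the map into partition/file-list pairs, as done before engineContext.parallelize(...).
    List<Pair<String, List<String>>> partitionToDeletedFilesList = partitionToDeletedFiles.entrySet().stream()
        .map(e -> Pair.of(e.getKey(), e.getValue()))
        .collect(Collectors.toList());

    // Clamp parallelism between 1 and a configured maximum (10 here is just an example value).
    int configuredParallelism = 10;
    int parallelism = Math.max(Math.min(partitionToDeletedFilesList.size(), configuredParallelism), 1);

    System.out.println("pairs=" + partitionToDeletedFilesList.size() + ", parallelism=" + parallelism);
    partitionToDeletedFilesList.forEach(pair -> System.out.println(pair.getLeft() + " -> " + pair.getRight()));
  }
}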
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class HoodieTableMetadataUtil, method convertFilesToColumnStatsRecords.
/**
 * Convert added and deleted action metadata to column stats index records.
 */
public static HoodieData<HoodieRecord> convertFilesToColumnStatsRecords(HoodieEngineContext engineContext,
    Map<String, List<String>> partitionToDeletedFiles, Map<String, Map<String, Long>> partitionToAppendedFiles,
    MetadataRecordsGenerationParams recordsGenerationParams) {
  HoodieData<HoodieRecord> allRecordsRDD = engineContext.emptyHoodieData();
  final List<String> columnsToIndex = getColumnsToIndex(recordsGenerationParams.getDataMetaClient(),
      recordsGenerationParams.isAllColumnStatsIndexEnabled());
  final List<Pair<String, List<String>>> partitionToDeletedFilesList = partitionToDeletedFiles.entrySet().stream()
      .map(e -> Pair.of(e.getKey(), e.getValue())).collect(Collectors.toList());
  int parallelism = Math.max(Math.min(partitionToDeletedFilesList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1);
  final HoodieData<Pair<String, List<String>>> partitionToDeletedFilesRDD = engineContext.parallelize(partitionToDeletedFilesList, parallelism);
  HoodieData<HoodieRecord> deletedFilesRecordsRDD = partitionToDeletedFilesRDD.flatMap(partitionToDeletedFilesPair -> {
    final String partitionName = partitionToDeletedFilesPair.getLeft();
    final String partition = getPartition(partitionName);
    final List<String> deletedFileList = partitionToDeletedFilesPair.getRight();
    return deletedFileList.stream().flatMap(deletedFile -> {
      final String filePathWithPartition = partitionName + "/" + deletedFile;
      return getColumnStats(partition, filePathWithPartition, recordsGenerationParams.getDataMetaClient(), columnsToIndex, true);
    }).iterator();
  });
  allRecordsRDD = allRecordsRDD.union(deletedFilesRecordsRDD);
  final List<Pair<String, Map<String, Long>>> partitionToAppendedFilesList = partitionToAppendedFiles.entrySet().stream()
      .map(entry -> Pair.of(entry.getKey(), entry.getValue())).collect(Collectors.toList());
  parallelism = Math.max(Math.min(partitionToAppendedFilesList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1);
  final HoodieData<Pair<String, Map<String, Long>>> partitionToAppendedFilesRDD = engineContext.parallelize(partitionToAppendedFilesList, parallelism);
  HoodieData<HoodieRecord> appendedFilesRecordsRDD = partitionToAppendedFilesRDD.flatMap(partitionToAppendedFilesPair -> {
    final String partitionName = partitionToAppendedFilesPair.getLeft();
    final String partition = getPartition(partitionName);
    final Map<String, Long> appendedFileMap = partitionToAppendedFilesPair.getRight();
    return appendedFileMap.entrySet().stream().flatMap(appendedFileNameLengthEntry -> {
      if (!FSUtils.isBaseFile(new Path(appendedFileNameLengthEntry.getKey()))
          || !appendedFileNameLengthEntry.getKey().endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
        return Stream.empty();
      }
      final String filePathWithPartition = partitionName + "/" + appendedFileNameLengthEntry.getKey();
      return getColumnStats(partition, filePathWithPartition, recordsGenerationParams.getDataMetaClient(), columnsToIndex, false);
    }).iterator();
  });
  allRecordsRDD = allRecordsRDD.union(appendedFilesRecordsRDD);
  return allRecordsRDD;
}
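Inside the appended-files branch, each Pair carries a partition name on the left and a file-name-to-size map on the right, and only parquet base files survive the filter before their partition-prefixed paths are handed to getColumnStats. The sketch below mirrors that flatMap shape with plain strings; isParquetFile is a stand-in for the FSUtils.isBaseFile / extension check, not a Hudi API:

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.hudi.common.util.collection.Pair;

public class ColumnStatsPairFlatMapExample {

  // Stand-in for the base-file / parquet-extension check in the original method.
  static boolean isParquetFile(String fileName) {
    return fileName.endsWith(".parquet");
  }

  public static void main(String[] args) {
    Map<String, Long> appendedFiles = new HashMap<>();
    appendedFiles.put("a1.parquet", 1024L);
    appendedFiles.put("a2.log", 2048L);

    Pair<String, Map<String, Long>> partitionToAppendedFilesPair = Pair.of("2022/01/01", appendedFiles);

    // Same shape as the flatMap body: skip non-parquet files, build a partition-prefixed path otherwise.
    String partitionName = partitionToAppendedFilesPair.getLeft();
    List<String> filePathsWithPartition = partitionToAppendedFilesPair.getRight().entrySet().stream()
        .flatMap(entry -> isParquetFile(entry.getKey())
            ? Stream.of(partitionName + "/" + entry.getKey())
            : Stream.<String>empty())
        .collect(Collectors.toList());

    // Only the parquet file remains: [2022/01/01/a1.parquet]
    System.out.println(filePathsWithPartition);
  }
}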