use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
the class TestHoodieRealtimeRecordReader method testLogOnlyReader.
@Test
public void testLogOnlyReader() throws Exception {
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
  HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
  String baseInstant = "100";
  File partitionDir = InputFormatTestUtil.prepareNonPartitionedParquetTable(basePath, schema, 1, 100, baseInstant,
      HoodieTableType.MERGE_ON_READ);
  FileCreateUtils.createDeltaCommit(basePath.toString(), baseInstant);
  // add the paths
  FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
  FileSlice fileSlice = new FileSlice("default", baseInstant, "fileid1");
  try {
    // update files or generate new log file
    int logVersion = 1;
    int baseInstantTs = Integer.parseInt(baseInstant);
    String instantTime = String.valueOf(baseInstantTs + logVersion);
    HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1",
        baseInstant, instantTime, 100, 0, logVersion);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue(size > 0, "block - size should be > 0");
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(),
        Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
    FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata);
    // create a split with the new log file(s)
    fileSlice.addLogFile(new HoodieLogFile(writer.getLogFile().getPath(), size));
    RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(
        new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath()),
        basePath.toString(), fileSlice.getLogFiles().collect(Collectors.toList()), false, Option.empty());
    realtimeFileStatus.setMaxCommitTime(instantTime);
    HoodieRealtimePath realtimePath = (HoodieRealtimePath) realtimeFileStatus.getPath();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(realtimePath, 0, 0, new String[] {""}), realtimePath);
    JobConf newJobConf = new JobConf(baseJobConf);
    List<Schema.Field> fields = schema.getFields();
    setHiveColumnNameProps(fields, newJobConf, false);
    // create a dummy RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new HoodieRealtimeRecordReader(split, newJobConf,
        new HoodieEmptyRecordReader(split, newJobConf));
    // use the reader to read the log file
    NullWritable key = reader.createKey();
    ArrayWritable value = reader.createValue();
    while (reader.next(key, value)) {
      Writable[] values = value.get();
      assertEquals(instantTime, values[0].toString());
      key = reader.createKey();
      value = reader.createValue();
    }
    reader.close();
  } catch (Exception e) {
    throw new HoodieException(e.getMessage(), e);
  }
}
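The test relies on setHiveColumnNameProps(fields, newJobConf, false) to register the schema's columns with the JobConf before the record reader is built. That helper is not shown in this snippet; the sketch below is only a rough, hypothetical illustration of the kind of Hive properties such a helper typically sets ("columns", "columns.types" and the read-column projection keys), not the actual utility from the Hudi test base.

// Hypothetical sketch, not the real setHiveColumnNameProps: registers every Avro field
// as a Hive column on the JobConf. All types are declared as string purely for illustration.
import org.apache.avro.Schema;
import org.apache.hadoop.mapred.JobConf;

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class HiveColumnPropsSketch {
  static void setHiveColumnNameProps(List<Schema.Field> fields, JobConf jobConf) {
    String names = fields.stream().map(Schema.Field::name).collect(Collectors.joining(","));
    String ids = IntStream.range(0, fields.size()).mapToObj(String::valueOf).collect(Collectors.joining(","));
    String types = fields.stream().map(f -> "string").collect(Collectors.joining(","));
    // standard Hive serde properties describing the table layout
    jobConf.set("columns", names);
    jobConf.set("columns.types", types);
    // projection hints read by Hive's ColumnProjectionUtils
    jobConf.set("hive.io.file.readcolumn.names", names);
    jobConf.set("hive.io.file.readcolumn.ids", ids);
  }
}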
use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
the class TestHoodieHFileInputFormat method createCommitFile.
private void createCommitFile(java.nio.file.Path basePath, String commitNumber, String partitionPath) throws IOException {
  List<HoodieWriteStat> writeStats = HoodieTestUtils.generateFakeHoodieWriteStat(1);
  HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
  writeStats.forEach(stat -> commitMetadata.addWriteStat(partitionPath, stat));
  File file = basePath.resolve(".hoodie").resolve(commitNumber + ".commit").toFile();
  file.createNewFile();
  FileOutputStream fileOutputStream = new FileOutputStream(file);
  fileOutputStream.write(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8));
  fileOutputStream.flush();
  fileOutputStream.close();
}
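The helper manually opens and closes a FileOutputStream to persist the serialized HoodieCommitMetadata. A functionally equivalent variant using java.nio.file.Files, shown below, avoids the explicit stream handling; it is an illustrative alternative (import paths assumed for recent Hudi versions), not code from the repository.

import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.testutils.HoodieTestUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

final class CommitFileSketch {

  // Writes <commitNumber>.commit with fake write stats under the table's .hoodie directory.
  static void createCommitFile(Path basePath, String commitNumber, String partitionPath) throws IOException {
    List<HoodieWriteStat> writeStats = HoodieTestUtils.generateFakeHoodieWriteStat(1);
    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    writeStats.forEach(stat -> commitMetadata.addWriteStat(partitionPath, stat));
    Path commitFile = basePath.resolve(".hoodie").resolve(commitNumber + ".commit");
    // Files.write creates the file if needed and closes the underlying stream even on failure.
    Files.write(commitFile, commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8));
  }
}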
use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
the class TestHoodieParquetInputFormat method createCommitFile.
private void createCommitFile(java.nio.file.Path basePath, String commitNumber, String partitionPath) throws IOException {
  List<HoodieWriteStat> writeStats = HoodieTestUtils.generateFakeHoodieWriteStat(1);
  HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
  writeStats.forEach(stat -> commitMetadata.addWriteStat(partitionPath, stat));
  File file = basePath.resolve(".hoodie").resolve(commitNumber + ".commit").toFile();
  file.createNewFile();
  FileOutputStream fileOutputStream = new FileOutputStream(file);
  fileOutputStream.write(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8));
  fileOutputStream.flush();
  fileOutputStream.close();
}
use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
the class HoodieMergeOnReadTableInputFormat method listStatusForIncrementalMode.
/**
 * Keep the logic of mor_incr_view the same as the Spark datasource.
 * Step 1: Get the list of commits to be fetched based on the start commit and max commits (for snapshot, max commits is -1).
 * Step 2: Get the list of file statuses affected by those commits.
 * Step 3: Construct a HoodieTableFileSystemView from those affected file statuses.
 *         a. Filter the affected partitions based on inputPaths.
 *         b. Get the list of file groups for the affected partitions via fsView.getAllFileGroups.
 * Step 4: Set the input paths based on the filtered affected partition paths. This may change the original input paths
 *         passed to this method, because some partitions have no commits in the trimmed-down list of commits and
 *         therefore must be dropped.
 * Step 5: Find candidate file statuses: the base file statuses obtained from the HoodieTableFileSystemView are missing
 *         file size information, so the candidate file statuses are used to fill in the sizes.
 * Step 6: For every file group from step 3(b), get the first available base file across all file slices, use the
 *         candidate file status to update its size, then construct a RealtimeFileStatus and add it to the result
 *         along with its log files. If a file group has only log files, construct a RealtimeFileStatus from those
 *         and add it to the result.
 * TODO: unify the incremental view code between hive/spark-sql and the spark datasource.
 */
@Override
protected List<FileStatus> listStatusForIncrementalMode(JobConf job, HoodieTableMetaClient tableMetaClient,
    List<Path> inputPaths, String incrementalTableName) throws IOException {
  List<FileStatus> result = new ArrayList<>();
  Job jobContext = Job.getInstance(job);
  // step1
  Option<HoodieTimeline> timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient);
  if (!timeline.isPresent()) {
    return result;
  }
  HoodieTimeline commitsTimelineToReturn = HoodieInputFormatUtils.getHoodieTimelineForIncrementalQuery(jobContext,
      incrementalTableName, timeline.get());
  Option<List<HoodieInstant>> commitsToCheck = Option.of(commitsTimelineToReturn.getInstants().collect(Collectors.toList()));
  if (!commitsToCheck.isPresent()) {
    return result;
  }
  // step2
  commitsToCheck.get().sort(HoodieInstant::compareTo);
  List<HoodieCommitMetadata> metadataList = commitsToCheck.get().stream().map(instant -> {
    try {
      return HoodieInputFormatUtils.getCommitMetadata(instant, commitsTimelineToReturn);
    } catch (IOException e) {
      throw new HoodieException(String.format("cannot get metadata for instant: %s", instant));
    }
  }).collect(Collectors.toList());
  // list the file statuses affected by these commits
  List<FileStatus> affectedFileStatus = Arrays.asList(HoodieInputFormatUtils.listAffectedFilesForCommits(job,
      new Path(tableMetaClient.getBasePath()), metadataList));
  // step3: build the file system view from those statuses
  HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(tableMetaClient, commitsTimelineToReturn,
      affectedFileStatus.toArray(new FileStatus[0]));
  Path basePath = new Path(tableMetaClient.getBasePath());
  // filter the affected partitions by inputPaths
  List<String> affectedPartition = HoodieInputFormatUtils.getWritePartitionPaths(metadataList).stream()
      .filter(k -> k.isEmpty() ? inputPaths.contains(basePath) : inputPaths.contains(new Path(basePath, k)))
      .collect(Collectors.toList());
  if (affectedPartition.isEmpty()) {
    return result;
  }
  // build the file groups for the affected partitions from fsView
  List<HoodieFileGroup> fileGroups = affectedPartition.stream()
      .flatMap(partitionPath -> fsView.getAllFileGroups(partitionPath))
      .collect(Collectors.toList());
  // step4
  setInputPaths(job, affectedPartition.stream()
      .map(p -> p.isEmpty() ? basePath.toString() : new Path(basePath, p).toString())
      .collect(Collectors.joining(",")));
  // step5
  // find all file statuses in the partition paths
  FileStatus[] fileStatuses = doListStatus(job);
  Map<String, FileStatus> candidateFileStatus = new HashMap<>();
  for (int i = 0; i < fileStatuses.length; i++) {
    String key = fileStatuses[i].getPath().toString();
    candidateFileStatus.put(key, fileStatuses[i]);
  }
  Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt = getHoodieVirtualKeyInfo(tableMetaClient);
  String maxCommitTime = fsView.getLastInstant().get().getTimestamp();
  // step6
  result.addAll(collectAllIncrementalFiles(fileGroups, maxCommitTime, basePath.toString(), candidateFileStatus, virtualKeyInfoOpt));
  return result;
}
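collectAllIncrementalFiles (step 6) is not reproduced in this excerpt. The sketch below illustrates, under assumptions about the Hudi file-group API (getAllFileSlices, getBaseFile, getLogFiles), the RealtimeFileStatus constructor used earlier on this page, and the import paths, roughly what that step does; the real implementation may differ in details such as which file slice is picked and how log-only groups are handled.

// Rough, hypothetical sketch of step 6 -- not the actual collectAllIncrementalFiles.
import org.apache.hadoop.fs.FileStatus;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieFileGroup;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.hadoop.HoodieVirtualKeyInfo;
import org.apache.hudi.hadoop.RealtimeFileStatus;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

final class IncrementalFilesSketch {

  static List<FileStatus> collectAllIncrementalFilesSketch(List<HoodieFileGroup> fileGroups, String maxCommitTime,
      String basePath, Map<String, FileStatus> candidateFileStatus,
      Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt) throws IOException {
    List<FileStatus> result = new ArrayList<>();
    for (HoodieFileGroup fileGroup : fileGroups) {
      // take the latest file slice of the group (slices are ordered newest first)
      java.util.Optional<FileSlice> latestSlice = fileGroup.getAllFileSlices().findFirst();
      if (!latestSlice.isPresent()) {
        continue;
      }
      FileSlice fileSlice = latestSlice.get();
      List<HoodieLogFile> logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
      if (fileSlice.getBaseFile().isPresent()) {
        // the fsView copy of the base file carries no size, so swap in the sized FileStatus from doListStatus
        FileStatus sized = candidateFileStatus.get(fileSlice.getBaseFile().get().getPath());
        RealtimeFileStatus status = new RealtimeFileStatus(sized, basePath, logFiles, false, virtualKeyInfoOpt);
        status.setMaxCommitTime(maxCommitTime);
        result.add(status);
      } else if (!logFiles.isEmpty()) {
        // log-only file group: expose the first log file so the realtime reader can still merge the log records
        RealtimeFileStatus status = new RealtimeFileStatus(logFiles.get(0).getFileStatus(), basePath, logFiles, false, virtualKeyInfoOpt);
        status.setMaxCommitTime(maxCommitTime);
        result.add(status);
      }
    }
    return result;
  }
}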
use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
the class StatsCommand method writeAmplificationStats.
@CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many "
    + "records were actually written")
public String writeAmplificationStats(
    @CliOption(key = {"limit"}, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
    @CliOption(key = {"sortBy"}, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
    @CliOption(key = {"desc"}, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
    @CliOption(key = {"headeronly"}, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly)
    throws IOException {
  long totalRecordsUpserted = 0;
  long totalRecordsWritten = 0;
  HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline();
  HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
  List<Comparable[]> rows = new ArrayList<>();
  DecimalFormat df = new DecimalFormat("#.00");
  for (HoodieInstant instantTime : timeline.getInstants().collect(Collectors.toList())) {
    String waf = "0";
    HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(instantTime).get(),
        HoodieCommitMetadata.class);
    if (commit.fetchTotalUpdateRecordsWritten() > 0) {
      waf = df.format((float) commit.fetchTotalRecordsWritten() / commit.fetchTotalUpdateRecordsWritten());
    }
    rows.add(new Comparable[] {instantTime.getTimestamp(), commit.fetchTotalUpdateRecordsWritten(),
        commit.fetchTotalRecordsWritten(), waf});
    totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
    totalRecordsWritten += commit.fetchTotalRecordsWritten();
  }
  String waf = "0";
  if (totalRecordsUpserted > 0) {
    waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
  }
  rows.add(new Comparable[] {"Total", totalRecordsUpserted, totalRecordsWritten, waf});
  TableHeader header = new TableHeader()
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_COMMIT_TIME)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_UPSERTED)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_WRITTEN)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_WRITE_AMPLIFICATION_FACTOR);
  return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
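For reference, the write-amplification factor reported by this command is total records written divided by total update (upserted) records, formatted to two decimal places. A tiny standalone illustration of that arithmetic, with made-up numbers:

import java.text.DecimalFormat;

public class WriteAmplificationExample {
  public static void main(String[] args) {
    // hypothetical numbers: 1,000 upserted records caused 4,500 records to be rewritten
    long totalRecordsUpserted = 1_000L;
    long totalRecordsWritten = 4_500L;
    DecimalFormat df = new DecimalFormat("#.00");
    String waf = totalRecordsUpserted > 0
        ? df.format((float) totalRecordsWritten / totalRecordsUpserted)
        : "0";
    System.out.println("Write amplification factor: " + waf); // prints 4.50
  }
}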