Use of org.apache.hudi.exception.HoodieException in project hudi by apache.
The class HoodieTestDataGenerator, method genPseudoRandomUUID.
private static UUID genPseudoRandomUUID(Random r) {
  byte[] bytes = new byte[16];
  r.nextBytes(bytes);
  // Set the version nibble of byte 6 to 4 ("randomly generated") and the variant bits of
  // byte 8 to the IETF/RFC 4122 layout, mirroring what java.util.UUID.randomUUID() does.
  bytes[6] &= 0x0f;
  bytes[6] |= 0x40;
  bytes[8] &= 0x3f;
  bytes[8] |= 0x80;
  try {
    // UUID has no public byte[] constructor, so the private one is invoked reflectively.
    Constructor<UUID> ctor = UUID.class.getDeclaredConstructor(byte[].class);
    ctor.setAccessible(true);
    return ctor.newInstance((Object) bytes);
  } catch (InvocationTargetException | InstantiationException | IllegalAccessException | NoSuchMethodException e) {
    logger.info("Failed to generate pseudo-random UUID!");
    throw new HoodieException(e);
  }
}
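For illustration, here is a minimal, self-contained sketch (not part of the Hudi sources) of the same technique: it applies the identical version/variant bit manipulation but assembles the UUID through the public UUID(long, long) constructor, so no reflection is needed. The class, method, and seed value are hypothetical.

import java.nio.ByteBuffer;
import java.util.Random;
import java.util.UUID;

public class SeededUuidSketch {

  // Equivalent approach without reflection: wrap the 16 random bytes in a ByteBuffer and
  // pass the two longs to the public UUID(long mostSigBits, long leastSigBits) constructor.
  static UUID uuidFromSeededRandom(Random r) {
    byte[] bytes = new byte[16];
    r.nextBytes(bytes);
    bytes[6] &= 0x0f; // clear version nibble
    bytes[6] |= 0x40; // version 4: "randomly generated"
    bytes[8] &= 0x3f; // clear variant bits
    bytes[8] |= 0x80; // IETF/RFC 4122 variant
    ByteBuffer bb = ByteBuffer.wrap(bytes);
    return new UUID(bb.getLong(), bb.getLong());
  }

  public static void main(String[] args) {
    Random seeded = new Random(42L); // fixed seed => the same sequence of IDs on every test run
    System.out.println(uuidFromSeededRandom(seeded));
    System.out.println(uuidFromSeededRandom(seeded));
  }
}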
Use of org.apache.hudi.exception.HoodieException in project hudi by apache.
The class FileCreateUtils, method getBaseFileCountsForPaths.
/**
 * Finds the total number of base files for the passed-in paths.
 */
public static Map<String, Long> getBaseFileCountsForPaths(String basePath, FileSystem fs, String... paths) {
  Map<String, Long> toReturn = new HashMap<>();
  try {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(fs.getConf())
        .setBasePath(basePath)
        .setLoadActiveTimelineOnLoad(true)
        .build();
    for (String path : paths) {
      TableFileSystemView.BaseFileOnlyView fileSystemView = new HoodieTableFileSystemView(metaClient,
          metaClient.getCommitsTimeline().filterCompletedInstants(),
          fs.globStatus(new org.apache.hadoop.fs.Path(path)));
      toReturn.put(path, fileSystemView.getLatestBaseFiles().count());
    }
    return toReturn;
  } catch (Exception e) {
    throw new HoodieException("Error reading hoodie table as a dataframe", e);
  }
}
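A hedged usage sketch follows. The base path and the date-style partition globs are illustrative, and the FileCreateUtils import is omitted because its package has moved between Hudi versions.

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BaseFileCountSketch {

  public static void main(String[] args) throws Exception {
    String basePath = "/tmp/hoodie_trips"; // hypothetical table location
    FileSystem fs = FileSystem.get(new Path(basePath).toUri(), new Configuration());
    // One glob per partition to count; each key in the returned map is the glob itself.
    Map<String, Long> counts = FileCreateUtils.getBaseFileCountsForPaths(
        basePath, fs, basePath + "/2016/03/15/*", basePath + "/2016/03/16/*");
    counts.forEach((path, count) -> System.out.println(path + " -> " + count + " base file(s)"));
  }
}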
Use of org.apache.hudi.exception.HoodieException in project hudi by apache.
The class TestHoodieRealtimeRecordReader, method testLogOnlyReader.
@Test
public void testLogOnlyReader() throws Exception {
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
  HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
  String baseInstant = "100";
  File partitionDir = InputFormatTestUtil.prepareNonPartitionedParquetTable(basePath, schema, 1, 100,
      baseInstant, HoodieTableType.MERGE_ON_READ);
  FileCreateUtils.createDeltaCommit(basePath.toString(), baseInstant);
  // Add the paths
  FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
  FileSlice fileSlice = new FileSlice("default", baseInstant, "fileid1");
  try {
    // update files or generate new log file
    int logVersion = 1;
    int baseInstantTs = Integer.parseInt(baseInstant);
    String instantTime = String.valueOf(baseInstantTs + logVersion);
    HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema,
        "fileid1", baseInstant, instantTime, 100, 0, logVersion);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue(size > 0, "block - size should be > 0");
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(),
        Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
    FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata);
    // create a split with new log file(s)
    fileSlice.addLogFile(new HoodieLogFile(writer.getLogFile().getPath(), size));
    RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(
        new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath()),
        basePath.toString(), fileSlice.getLogFiles().collect(Collectors.toList()), false, Option.empty());
    realtimeFileStatus.setMaxCommitTime(instantTime);
    HoodieRealtimePath realtimePath = (HoodieRealtimePath) realtimeFileStatus.getPath();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
        new FileSplit(realtimePath, 0, 0, new String[] { "" }), realtimePath);
    JobConf newJobConf = new JobConf(baseJobConf);
    List<Schema.Field> fields = schema.getFields();
    setHiveColumnNameProps(fields, newJobConf, false);
    // create a dummy RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new HoodieRealtimeRecordReader(split, newJobConf,
        new HoodieEmptyRecordReader(split, newJobConf));
    // use reader to read log file.
    NullWritable key = reader.createKey();
    ArrayWritable value = reader.createValue();
    while (reader.next(key, value)) {
      Writable[] values = value.get();
      assertEquals(instantTime, values[0].toString());
      key = reader.createKey();
      value = reader.createValue();
    }
    reader.close();
  } catch (Exception e) {
    throw new HoodieException(e.getMessage(), e);
  }
}
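The helper setHiveColumnNameProps is not shown in this excerpt. The sketch below is only a guess at the kind of Hive column-projection properties such a helper typically sets on the JobConf; the property keys are standard Hive ones, but the actual logic in the Hudi test may differ, and the real helper also sets the "columns.types" property (deriving Hive type strings from Avro is omitted here).

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.avro.Schema;
import org.apache.hadoop.mapred.JobConf;

public class HiveColumnPropsSketch {

  // Illustrative only: register every Avro field as a Hive column and mark all of them as projected.
  static void configureHiveColumns(List<Schema.Field> fields, JobConf jobConf) {
    String names = fields.stream().map(Schema.Field::name).collect(Collectors.joining(","));
    String ids = IntStream.range(0, fields.size()).mapToObj(String::valueOf).collect(Collectors.joining(","));
    jobConf.set("columns", names);                       // standard Hive serde property listing column names
    jobConf.set("hive.io.file.readcolumn.names", names); // columns the record reader should materialize
    jobConf.set("hive.io.file.readcolumn.ids", ids);     // positional ids of those columns
  }
}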
Use of org.apache.hudi.exception.HoodieException in project hudi by apache.
The class HoodieRealtimeInputFormatUtils, method groupLogsByBaseFile.
// Returns each base (parquet) file paired with the list of log files in the same file group.
public static List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groupLogsByBaseFile(Configuration conf, List<Path> partitionPaths) {
  Set<Path> partitionSet = new HashSet<>(partitionPaths);
  // TODO(vc): Should we also handle non-hoodie splits here?
  Map<Path, HoodieTableMetaClient> partitionsToMetaClient = getTableMetaClientByPartitionPath(conf, partitionSet);
  // Get all base file / log file pairs in the required partition paths.
  List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> baseAndLogsList = new ArrayList<>();
  partitionSet.forEach(partitionPath -> {
    // for each partition path, obtain the data & log file groupings, then map back to input splits
    HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath);
    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline());
    String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath);
    try {
      // Both commits and delta-commits are included - pick the latest completed one
      Option<HoodieInstant> latestCompletedInstant =
          metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants().lastInstant();
      Stream<FileSlice> latestFileSlices = latestCompletedInstant
          .map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp()))
          .orElse(Stream.empty());
      latestFileSlices.forEach(fileSlice -> {
        List<HoodieLogFile> logFilePaths = fileSlice.getLogFiles()
            .sorted(HoodieLogFile.getLogFileComparator())
            .collect(Collectors.toList());
        baseAndLogsList.add(Pair.of(fileSlice.getBaseFile(), logFilePaths));
      });
    } catch (Exception e) {
      throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e);
    }
  });
  return baseAndLogsList;
}
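As a hedged consumption sketch, each returned pair can be walked as below; a pair whose base-file Option is empty is a log-only file group, which is the case exercised by testLogOnlyReader above. The class and method names are hypothetical, the Hudi package imports may differ across versions, and the import for HoodieRealtimeInputFormatUtils is omitted for that reason.

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

public class FileGroupingSketch {

  // Print one line per file group: its base file (if any) and how many log files a realtime reader would merge.
  static void printGroupings(Configuration conf, List<Path> partitionPaths) {
    List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groupings =
        HoodieRealtimeInputFormatUtils.groupLogsByBaseFile(conf, partitionPaths);
    for (Pair<Option<HoodieBaseFile>, List<HoodieLogFile>> group : groupings) {
      String base = group.getLeft().isPresent() ? group.getLeft().get().getPath() : "<log-only file group>";
      System.out.println("base=" + base + ", logFiles=" + group.getRight().size());
    }
  }
}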
Use of org.apache.hudi.exception.HoodieException in project hudi by apache.
The class HoodieRealtimeRecordReaderUtils, method generateProjectionSchema.
/**
 * Generates a reader schema from the provided writeSchema, projecting out just the provided columns.
 */
public static Schema generateProjectionSchema(Schema writeSchema, Map<String, Schema.Field> schemaFieldsMap, List<String> fieldNames) {
  /*
   * Avro & Presto field names seem to be case-sensitive (they support fields differing only in case), whereas
   * Hive/Impala/SparkSQL (by default) are case-insensitive. Spark allows this to be made configurable via
   * spark.sql.caseSensitive=true.
   *
   * For an RT table set up with no delta files (for the latest file slice), we translate the parquet schema to Avro
   * here, so the field-name case depends on the parquet schema. Hive (1.x/2.x/CDH) translates column projections to
   * lower case.
   */
  List<Schema.Field> projectedFields = new ArrayList<>();
  for (String fn : fieldNames) {
    Schema.Field field = schemaFieldsMap.get(fn.toLowerCase());
    if (field == null) {
      throw new HoodieException("Field " + fn + " not found in log schema. Query cannot proceed! "
          + "Derived Schema Fields: " + new ArrayList<>(schemaFieldsMap.keySet()));
    } else {
      projectedFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal()));
    }
  }
  Schema projectedSchema = Schema.createRecord(writeSchema.getName(), writeSchema.getDoc(), writeSchema.getNamespace(), writeSchema.isError());
  projectedSchema.setFields(projectedFields);
  return projectedSchema;
}
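The schemaFieldsMap parameter is expected to be keyed by lower-cased field names, since the lookup above uses fn.toLowerCase(). Below is a hedged sketch of building that map from the writer schema and projecting a caller-supplied column list; the class and method names are hypothetical, and the HoodieRealtimeRecordReaderUtils import is omitted because its package has moved between Hudi versions.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.avro.Schema;

public class ProjectionSketch {

  // Build the lower-cased name -> field map that generateProjectionSchema expects, then project the requested columns.
  static Schema projectColumns(Schema writeSchema, List<String> requestedColumns) {
    Map<String, Schema.Field> fieldsByLowerCaseName = writeSchema.getFields().stream()
        .collect(Collectors.toMap(f -> f.name().toLowerCase(), f -> f));
    return HoodieRealtimeRecordReaderUtils.generateProjectionSchema(writeSchema, fieldsByLowerCaseName, requestedColumns);
  }
}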