Use of org.apache.hudi.exception.InvalidTableException in project hudi by apache.
From the class TableSchemaResolver, method getLatestSchema.
/**
 * Gets the latest schema, either from the incoming batch's write schema or from the table schema.
 * @param writeSchema incoming batch's write schema.
 * @param convertTableSchemaToAddNamespace {@code true} if the table schema needs to be converted, {@code false} otherwise.
 * @param converterFn converter function applied to the table schema (for example, to add a namespace). Each caller decides whether any conversion is required.
 * @return the latest schema.
 */
public Schema getLatestSchema(Schema writeSchema, boolean convertTableSchemaToAddNamespace, Function1<Schema, Schema> converterFn) {
  Schema latestSchema = writeSchema;
  try {
    if (metaClient.isTimelineNonEmpty()) {
      Schema tableSchema = getTableAvroSchemaWithoutMetadataFields();
      if (convertTableSchemaToAddNamespace && converterFn != null) {
        tableSchema = converterFn.apply(tableSchema);
      }
      if (writeSchema.getFields().size() < tableSchema.getFields().size() && isSchemaCompatible(writeSchema, tableSchema)) {
        // The incoming schema is a subset (an older schema) of the table schema, e.g. one of the
        // ingestion pipelines is still producing events with the old schema.
        latestSchema = tableSchema;
        LOG.debug("Using latest table schema to rewrite incoming records " + tableSchema.toString());
      }
    }
  } catch (IllegalArgumentException | InvalidTableException e) {
    LOG.warn("Could not find any commits, falling back to using incoming batch's write schema");
  } catch (Exception e) {
    LOG.warn("Unknown exception thrown " + e.getMessage() + ", falling back to using incoming batch's write schema");
  }
  return latestSchema;
}
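As a usage illustration, a caller that wants the table schema rewritten into its own namespace can pass a lambda as the converter. The sketch below is not from the Hudi sources: it assumes a TableSchemaResolver built from the same metaClient, assumes Function1 is a single-method functional interface so a lambda can be supplied, and uses a hypothetical addWriterNamespace helper as a stand-in for whatever conversion the caller needs.

// Minimal caller sketch (hypothetical). addWriterNamespace is an assumed helper, not a Hudi API.
TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
Schema resolvedSchema = schemaResolver.getLatestSchema(
    incomingWriteSchema,                          // the incoming batch's write schema
    true,                                         // run the converter over the table schema
    tableSchema -> addWriterNamespace(tableSchema)); // e.g. rewrite the table schema with the writer's namespace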
Use of org.apache.hudi.exception.InvalidTableException in project hudi by apache.
From the class InputPathHandler, method parseInputPaths.
/**
 * Takes in the original InputPaths and classifies each of them into incremental, snapshot and
 * non-Hoodie InputPaths. The logic is as follows:
 * 1. Check if an inputPath starts with the same basePath as any of the metadata basePaths we know
 *    1a. If yes, this belongs to a Hoodie table that we already know about. Simply classify this
 *        as incremental or snapshot - we can get the table name of this inputPath from the
 *        metadata. Then, based on the list of incrementalTables, we can classify this inputPath.
 *    1b. If no, this could be a new Hoodie table we haven't seen yet, or a non-Hoodie input path.
 *        Try creating the HoodieTableMetaClient.
 *        - If it succeeds, further classify as incremental or snapshot as described in step
 *          1a above.
 *        - If TableNotFoundException/InvalidTableException is caught, this is a
 *          non-Hoodie inputPath.
 * @param inputPaths - InputPaths from the original jobConf that was passed to HoodieInputFormat
 * @param incrementalTables - List of all incremental tables extracted from the config
 *                            `hoodie.<table-name>.consume.mode=INCREMENTAL`
 * @throws IOException
 */
private void parseInputPaths(Path[] inputPaths, List<String> incrementalTables) throws IOException {
  for (Path inputPath : inputPaths) {
    boolean basePathKnown = false;
    for (HoodieTableMetaClient metaClient : tableMetaClientMap.values()) {
      if (inputPath.toString().contains(metaClient.getBasePath())) {
        // We already know the base path for this inputPath.
        basePathKnown = true;
        // Classify the path as incremental or snapshot.
        tagAsIncrementalOrSnapshot(inputPath, metaClient, incrementalTables);
        break;
      }
    }
    if (!basePathKnown) {
      // This path is for a table that we don't know about yet.
      HoodieTableMetaClient metaClient;
      try {
        metaClient = getTableMetaClientForBasePathUnchecked(conf, inputPath);
        tableMetaClientMap.put(getIncrementalTable(metaClient), metaClient);
        tagAsIncrementalOrSnapshot(inputPath, metaClient, incrementalTables);
      } catch (TableNotFoundException | InvalidTableException e) {
        // This is a non-Hoodie inputPath.
        LOG.info("Handling a non-hoodie path " + inputPath);
        nonHoodieInputPaths.add(inputPath);
      }
    }
  }
}
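The incrementalTables argument is derived from job configuration properties of the form quoted in the Javadoc above. The sketch below shows one way such a list could be built from a Hadoop JobConf; the prefix/suffix handling and the extraction loop are illustrative assumptions, not the code Hudi actually uses.

// Sketch: collect table names whose `hoodie.<table-name>.consume.mode` is set to INCREMENTAL.
// Assumes jobConf is an org.apache.hadoop.mapred.JobConf (iterable as key/value entries).
List<String> incrementalTables = new ArrayList<>();
for (Map.Entry<String, String> entry : jobConf) {
  String key = entry.getKey();
  if (key.startsWith("hoodie.") && key.endsWith(".consume.mode")
      && "INCREMENTAL".equalsIgnoreCase(entry.getValue())) {
    // Strip the "hoodie." prefix and ".consume.mode" suffix to recover <table-name>.
    incrementalTables.add(key.substring("hoodie.".length(), key.length() - ".consume.mode".length()));
  }
}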
Use of org.apache.hudi.exception.InvalidTableException in project hudi by apache.
From the class TableSchemaResolver, method getTableParquetSchemaFromDataFile.
/**
 * Gets the schema for a Hoodie table. Depending on the type of table, reads the schema from any file
 * written in the latest commit. We assume that the schema has not changed within a single atomic write.
 *
 * @return Parquet schema for this table
 */
private MessageType getTableParquetSchemaFromDataFile() {
  HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
  Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata = activeTimeline.getLastCommitMetadataWithValidData();
  try {
    switch (metaClient.getTableType()) {
      case COPY_ON_WRITE:
        // For a COW table, any data file written must currently be in Parquet or ORC format.
        if (instantAndCommitMetadata.isPresent()) {
          HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
          String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
          return readSchemaFromBaseFile(filePath);
        } else {
          throw new IllegalArgumentException("Could not find any data file written for commit, "
              + "so could not get schema for table " + metaClient.getBasePath());
        }
      case MERGE_ON_READ:
        // Determine the file format based on the file name, and then extract the schema from it.
        if (instantAndCommitMetadata.isPresent()) {
          HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
          String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny().get();
          if (filePath.contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
            // This is a log file.
            return readSchemaFromLogFile(new Path(filePath));
          } else {
            return readSchemaFromBaseFile(filePath);
          }
        } else {
          throw new IllegalArgumentException("Could not find any data file written for commit, "
              + "so could not get schema for table " + metaClient.getBasePath());
        }
      default:
        LOG.error("Unknown table type " + metaClient.getTableType());
        throw new InvalidTableException(metaClient.getBasePath());
    }
  } catch (IOException e) {
    throw new HoodieException("Failed to read data schema", e);
  }
}
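The readSchemaFromBaseFile helper is not shown in this snippet. The sketch below illustrates how such a dispatch could look, assuming the base file format is detected from the file extension; readParquetSchema and readOrcSchema are hypothetical placeholders, not real Hudi methods, and the real implementation may differ.

// Sketch only: choose a reader based on the base file's extension and delegate to
// hypothetical per-format helpers that return the Parquet MessageType.
private MessageType readSchemaFromBaseFile(String filePath) throws IOException {
  if (filePath.endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
    return readParquetSchema(new Path(filePath)); // hypothetical: read schema from the Parquet footer
  } else if (filePath.endsWith(HoodieFileFormat.ORC.getFileExtension())) {
    return readOrcSchema(new Path(filePath));     // hypothetical: read the ORC schema and convert to MessageType
  }
  throw new HoodieException("Unsupported base file format for schema resolution: " + filePath);
}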