Use of org.apache.gobblin.data.management.conversion.hive.entities.QueryBasedHivePublishEntity in project incubator-gobblin by apache.
The class HiveMaterializerFromEntityQueryGenerator, method generatePublishQueries.
/**
* Returns a QueryBasedHivePublishEntity which includes publish-level queries and cleanup commands.
* @return QueryBasedHivePublishEntity
* @throws DataConversionException
*/
public QueryBasedHivePublishEntity generatePublishQueries() throws DataConversionException {
QueryBasedHivePublishEntity publishEntity = new QueryBasedHivePublishEntity();
List<String> publishQueries = publishEntity.getPublishQueries();
Map<String, String> publishDirectories = publishEntity.getPublishDirectories();
List<String> cleanupQueries = publishEntity.getCleanupQueries();
List<String> cleanupDirectories = publishEntity.getCleanupDirectories();
String createFinalTableDDL = HiveConverterUtils.generateCreateDuplicateTableDDL(outputDatabaseName, stagingTableName, outputTableName, outputDataLocation, Optional.of(outputDatabaseName));
publishQueries.add(createFinalTableDDL);
log.debug("Create final table DDL:\n" + createFinalTableDDL);
if (!this.supportTargetPartitioning || partitionsDDLInfo.size() == 0) {
log.debug("Snapshot directory to move: " + stagingDataLocation + " to: " + outputDataLocation);
publishDirectories.put(stagingDataLocation, outputDataLocation);
String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(outputDatabaseName, stagingTableName);
log.debug("Drop staging table DDL: " + dropStagingTableDDL);
cleanupQueries.add(dropStagingTableDDL);
log.debug("Staging table directory to delete: " + stagingDataLocation);
cleanupDirectories.add(stagingDataLocation);
} else {
String finalDataPartitionLocation = outputDataLocation + Path.SEPARATOR + stagingDataPartitionDirName;
Optional<Path> destPartitionLocation = HiveConverterUtils.getDestinationPartitionLocation(destinationTableMeta, this.workUnitState, conversionEntity.getPartition().get().getName());
finalDataPartitionLocation = HiveConverterUtils.updatePartitionLocation(finalDataPartitionLocation, this.workUnitState, destPartitionLocation);
log.debug("Partition directory to move: " + stagingDataPartitionLocation + " to: " + finalDataPartitionLocation);
publishDirectories.put(stagingDataPartitionLocation, finalDataPartitionLocation);
List<String> dropPartitionsDDL = HiveAvroORCQueryGenerator.generateDropPartitionsDDL(outputDatabaseName, outputTableName, partitionsDMLInfo);
log.debug("Drop partitions if exist in final table: " + dropPartitionsDDL);
publishQueries.addAll(dropPartitionsDDL);
List<String> createFinalPartitionDDL = HiveAvroORCQueryGenerator.generateCreatePartitionDDL(outputDatabaseName, outputTableName, finalDataPartitionLocation, partitionsDMLInfo, Optional.<String>absent());
log.debug("Create final partition DDL: " + createFinalPartitionDDL);
publishQueries.addAll(createFinalPartitionDDL);
String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(outputDatabaseName, stagingTableName);
log.debug("Drop staging table DDL: " + dropStagingTableDDL);
cleanupQueries.add(dropStagingTableDDL);
log.debug("Staging table directory to delete: " + stagingDataLocation);
cleanupDirectories.add(stagingDataLocation);
publishQueries.addAll(HiveAvroORCQueryGenerator.generateDropPartitionsDDL(outputDatabaseName, outputTableName, AbstractAvroToOrcConverter.getDropPartitionsDDLInfo(conversionEntity)));
}
log.info("Publish partition entity: " + publishEntity);
return publishEntity;
}
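For context, here is a minimal sketch of how a caller might consume the entity returned by generatePublishQueries(). Only the QueryBasedHivePublishEntity accessors come from the snippet above; the helper name, the HiveJdbcConnector/FileSystem parameters, and the overall control flow are assumptions for illustration, not the actual Gobblin task flow.
// Hypothetical helper (not part of Gobblin): consumes a QueryBasedHivePublishEntity
// in the usual order -- publish queries, directory moves, then cleanup.
static void applyPublishEntity(QueryBasedHivePublishEntity entity, HiveJdbcConnector hiveJdbcConnector, FileSystem fs) throws Exception {
  for (String ddl : entity.getPublishQueries()) {
    // Register the final table / partitions
    hiveJdbcConnector.executeStatements(ddl);
  }
  for (Map.Entry<String, String> move : entity.getPublishDirectories().entrySet()) {
    // Move staging data into place (source -> destination)
    fs.rename(new Path(move.getKey()), new Path(move.getValue()));
  }
  for (String ddl : entity.getCleanupQueries()) {
    // Drop the staging table
    hiveJdbcConnector.executeStatements(ddl);
  }
  for (String dir : entity.getCleanupDirectories()) {
    // Delete the staging directory
    fs.delete(new Path(dir), true);
  }
}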
Use of org.apache.gobblin.data.management.conversion.hive.entities.QueryBasedHivePublishEntity in project incubator-gobblin by apache.
The class HiveConvertPublisher, method publishData.
@Override
public void publishData(Collection<? extends WorkUnitState> states) throws IOException {
Set<String> cleanUpQueries = Sets.newLinkedHashSet();
Set<String> publishQueries = Sets.newLinkedHashSet();
List<String> directoriesToDelete = Lists.newArrayList();
try {
if (Iterables.tryFind(states, UNSUCCESSFUL_WORKUNIT).isPresent()) {
// ///////////////////////////////////////
for (WorkUnitState wus : states) {
QueryBasedHivePublishEntity publishEntity = HiveAvroORCQueryGenerator.deserializePublishCommands(wus);
// Add cleanup commands - to be executed later
if (publishEntity.getCleanupQueries() != null) {
cleanUpQueries.addAll(publishEntity.getCleanupQueries());
}
if (publishEntity.getCleanupDirectories() != null) {
directoriesToDelete.addAll(publishEntity.getCleanupDirectories());
}
EventWorkunitUtils.setBeginPublishDDLExecuteTimeMetadata(wus, System.currentTimeMillis());
wus.setWorkingState(WorkingState.FAILED);
if (!wus.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
try {
new SlaEventSubmitter(eventSubmitter, EventConstants.CONVERSION_FAILED_EVENT, wus.getProperties()).submit();
} catch (Exception e) {
log.error("Failed while emitting SLA event, but ignoring and moving forward to curate " + "all clean up comamnds", e);
}
}
}
} else {
// ///////////////////////////////////////
for (WorkUnitState wus : PARTITION_PUBLISH_ORDERING.sortedCopy(states)) {
QueryBasedHivePublishEntity publishEntity = HiveAvroORCQueryGenerator.deserializePublishCommands(wus);
// Add cleanup commands - to be executed later
if (publishEntity.getCleanupQueries() != null) {
cleanUpQueries.addAll(publishEntity.getCleanupQueries());
}
if (publishEntity.getCleanupDirectories() != null) {
directoriesToDelete.addAll(publishEntity.getCleanupDirectories());
}
if (publishEntity.getPublishDirectories() != null) {
// Publish snapshot / partition directories
Map<String, String> publishDirectories = publishEntity.getPublishDirectories();
for (Map.Entry<String, String> publishDir : publishDirectories.entrySet()) {
moveDirectory(publishDir.getKey(), publishDir.getValue());
}
}
if (publishEntity.getPublishQueries() != null) {
publishQueries.addAll(publishEntity.getPublishQueries());
}
}
// Update publish start timestamp on all workunits
for (WorkUnitState wus : PARTITION_PUBLISH_ORDERING.sortedCopy(states)) {
if (HiveAvroORCQueryGenerator.deserializePublishCommands(wus).getPublishQueries() != null) {
EventWorkunitUtils.setBeginPublishDDLExecuteTimeMetadata(wus, System.currentTimeMillis());
}
}
// Actual publish: Register snapshot / partition
executeQueries(Lists.newArrayList(publishQueries));
// Update publish completion timestamp on all workunits
for (WorkUnitState wus : PARTITION_PUBLISH_ORDERING.sortedCopy(states)) {
if (HiveAvroORCQueryGenerator.deserializePublishCommands(wus).getPublishQueries() != null) {
EventWorkunitUtils.setEndPublishDDLExecuteTimeMetadata(wus, System.currentTimeMillis());
}
wus.setWorkingState(WorkingState.COMMITTED);
this.watermarker.setActualHighWatermark(wus);
// Emit an SLA event for conversion successful
if (!wus.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
EventWorkunitUtils.setIsFirstPublishMetadata(wus);
try {
new SlaEventSubmitter(eventSubmitter, EventConstants.CONVERSION_SUCCESSFUL_SLA_EVENT, wus.getProperties()).submit();
} catch (Exception e) {
log.error("Failed while emitting SLA event, but ignoring and moving forward to curate " + "all clean up commands", e);
}
}
}
}
} finally {
// ///////////////////////////////////////
// Preserving partition params
// ///////////////////////////////////////
preservePartitionParams(states);
// Execute cleanup commands
try {
executeQueries(Lists.newArrayList(cleanUpQueries));
} catch (Exception e) {
log.error("Failed to cleanup staging entities in Hive metastore.", e);
}
try {
deleteDirectories(directoriesToDelete);
} catch (Exception e) {
log.error("Failed to cleanup staging directories.", e);
}
}
}
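The publishData() flow above relies on two Guava constructs, UNSUCCESSFUL_WORKUNIT and PARTITION_PUBLISH_ORDERING, whose definitions are not shown. Below is a hedged sketch of what they might look like; the predicate logic and the ordering key are assumptions inferred from how the method uses them, not the actual fields in HiveConvertPublisher.
// Sketch only: a work unit counts as unsuccessful unless it reached the SUCCESSFUL state.
private static final Predicate<WorkUnitState> UNSUCCESSFUL_WORKUNIT = new Predicate<WorkUnitState>() {
  @Override
  public boolean apply(WorkUnitState wus) {
    return wus == null || wus.getWorkingState() != WorkingState.SUCCESSFUL;
  }
};

// Sketch only: publish work units in a deterministic order (here by dataset URN) so that
// retries and SLA events behave consistently across runs; the real ordering key is an assumption.
private static final Ordering<WorkUnitState> PARTITION_PUBLISH_ORDERING =
    Ordering.<String>natural().onResultOf(new Function<WorkUnitState, String>() {
      @Override
      public String apply(WorkUnitState wus) {
        return wus.getProp(ConfigurationKeys.DATASET_URN_KEY, "");
      }
    });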
Use of org.apache.gobblin.data.management.conversion.hive.entities.QueryBasedHivePublishEntity in project incubator-gobblin by apache.
The class AbstractAvroToOrcConverter, method convertRecord.
/**
* Populates the Avro to ORC conversion queries. The queries will be added to {@link QueryBasedHiveConversionEntity#getQueries()}.
*/
@Override
public Iterable<QueryBasedHiveConversionEntity> convertRecord(Schema outputAvroSchema, QueryBasedHiveConversionEntity conversionEntity, WorkUnitState workUnit) throws DataConversionException {
Preconditions.checkNotNull(outputAvroSchema, "Avro schema must not be null");
Preconditions.checkNotNull(conversionEntity, "Conversion entity must not be null");
Preconditions.checkNotNull(workUnit, "Workunit state must not be null");
Preconditions.checkNotNull(conversionEntity.getTable(), "Hive table within conversion entity must not be null");
EventWorkunitUtils.setBeginDDLBuildTimeMetadata(workUnit, System.currentTimeMillis());
this.hiveDataset = conversionEntity.getConvertibleHiveDataset();
if (!hasConversionConfig()) {
return new SingleRecordIterable<>(conversionEntity);
}
// Avro table name and location
String avroTableName = conversionEntity.getTable().getTableName();
// ORC table name and location
String orcTableName = getConversionConfig().getDestinationTableName();
String orcStagingTableName = getOrcStagingTableName(getConversionConfig().getDestinationStagingTableName());
String orcTableDatabase = getConversionConfig().getDestinationDbName();
String orcDataLocation = getOrcDataLocation();
String orcStagingDataLocation = getOrcStagingDataLocation(orcStagingTableName);
boolean isEvolutionEnabled = getConversionConfig().isEvolutionEnabled();
Pair<Optional<Table>, Optional<List<Partition>>> destinationMeta = HiveConverterUtils.getDestinationTableMeta(orcTableDatabase, orcTableName, workUnit.getProperties());
Optional<Table> destinationTableMeta = destinationMeta.getLeft();
// Optional view registration blacklist / whitelist
Optional<WhitelistBlacklist> optionalViewRegistrationWhiteBlacklist = getViewWhiteBackListFromWorkUnit(workUnit);
// wrapperViewName: if specified, a view named 'wrapperViewName' is created over the destination table
// if it does not already exist
// isUpdateViewAlwaysEnabled: if false, 'wrapperViewName' is only updated when the schema evolves; if true,
// 'wrapperViewName' is always updated (every time a publish happens)
Optional<String> wrapperViewName = Optional.<String>absent();
if (optionalViewRegistrationWhiteBlacklist.isPresent()) {
wrapperViewName = optionalViewRegistrationWhiteBlacklist.get().acceptTable(orcTableDatabase, orcTableName) ? getConversionConfig().getDestinationViewName() : wrapperViewName;
} else {
wrapperViewName = getConversionConfig().getDestinationViewName();
}
boolean shouldUpdateView = getConversionConfig().isUpdateViewAlwaysEnabled();
// Other properties
Optional<List<String>> clusterBy = getConversionConfig().getClusterBy().isEmpty() ? Optional.<List<String>>absent() : Optional.of(getConversionConfig().getClusterBy());
Optional<Integer> numBuckets = getConversionConfig().getNumBuckets();
Optional<Integer> rowLimit = getConversionConfig().getRowLimit();
Properties tableProperties = getConversionConfig().getDestinationTableProperties();
// Partition dir hint helps create different directories for hourly and daily partitions with the same timestamp, such as:
// .. daily_2016-01-01-00 and hourly_2016-01-01-00
// This prevents existing hourly data from being deleted at roll-up time, so that Hive queries in flight
// .. do not fail
List<String> sourceDataPathIdentifier = getConversionConfig().getSourceDataPathIdentifier();
// Populate optional partition info
Map<String, String> partitionsDDLInfo = Maps.newHashMap();
Map<String, String> partitionsDMLInfo = Maps.newHashMap();
HiveConverterUtils.populatePartitionInfo(conversionEntity, partitionsDDLInfo, partitionsDMLInfo);
/*
* Create the ORC data location with the same permissions as the Avro data.
*
* Note that Hive can also automatically create the non-existing directories, but it does not
* seem to create them with the desired permissions.
* According to the Hive docs, permissions for newly created directories/files can be controlled using a umask, e.g.:
*
* SET hive.warehouse.subdir.inherit.perms=false;
* SET fs.permissions.umask-mode=022;
* Upon testing, this did not work.
*/
try {
FileStatus sourceDataFileStatus = this.fs.getFileStatus(conversionEntity.getTable().getDataLocation());
FsPermission sourceDataPermission = sourceDataFileStatus.getPermission();
if (!this.fs.mkdirs(new Path(getConversionConfig().getDestinationDataPath()), sourceDataPermission)) {
throw new RuntimeException(String.format("Failed to create path %s with permissions %s", new Path(getConversionConfig().getDestinationDataPath()), sourceDataPermission));
} else {
this.fs.setPermission(new Path(getConversionConfig().getDestinationDataPath()), sourceDataPermission);
// Explicitly set group name for destination location if specified; otherwise preserve the source group name
String destinationGroupName;
if (workUnit.contains(HIVE_DATASET_DESTINATION_GROUP_NAME)) {
destinationGroupName = workUnit.getProp(HIVE_DATASET_DESTINATION_GROUP_NAME);
} else {
destinationGroupName = sourceDataFileStatus.getGroup();
}
if (!workUnit.getPropAsBoolean(HIVE_DATASET_DESTINATION_SKIP_SETGROUP, DEFAULT_HIVE_DATASET_DESTINATION_SKIP_SETGROUP)) {
this.fs.setOwner(new Path(getConversionConfig().getDestinationDataPath()), null, destinationGroupName);
}
log.info(String.format("Created %s with permissions %s and group %s", new Path(getConversionConfig().getDestinationDataPath()), sourceDataPermission, sourceDataFileStatus.getGroup()));
// Explicitly set group name for staging directory if specified
if (workUnit.contains(HIVE_DATASET_STAGING_GROUP_NAME)) {
String stagingGroupName = workUnit.getProp(HIVE_DATASET_STAGING_GROUP_NAME);
log.info("Setting staging directory group name as " + stagingGroupName);
this.fs.mkdirs(new Path(getOrcStagingDataLocation(orcStagingTableName)));
this.fs.setOwner(new Path(getOrcStagingDataLocation(orcStagingTableName)), null, stagingGroupName);
// Staging directory will be renamed to getOrcDataLocation(), hence its group name should match
// the group name of the staging directory
this.fs.mkdirs(new Path(getOrcDataLocation()));
this.fs.setOwner(new Path(getOrcDataLocation()), null, stagingGroupName);
}
}
} catch (IOException e) {
Throwables.propagate(e);
}
// Set hive runtime properties from conversion config
for (Map.Entry<Object, Object> entry : getConversionConfig().getHiveRuntimeProperties().entrySet()) {
conversionEntity.getQueries().add(String.format("SET %s=%s", entry.getKey(), entry.getValue()));
}
// Set hive runtime properties for tracking
conversionEntity.getQueries().add(String.format("SET %s=%s", GOBBLIN_DATASET_URN_KEY, conversionEntity.getTable().getCompleteName()));
if (conversionEntity.getPartition().isPresent()) {
conversionEntity.getQueries().add(String.format("SET %s=%s", GOBBLIN_PARTITION_NAME_KEY, conversionEntity.getPartition().get().getCompleteName()));
}
conversionEntity.getQueries().add(String.format("SET %s=%s", GOBBLIN_WORKUNIT_CREATE_TIME_KEY, workUnit.getWorkunit().getProp(SlaEventKeys.ORIGIN_TS_IN_MILLI_SECS_KEY)));
// Create DDL statement for table
Map<String, String> hiveColumns = new LinkedHashMap<>();
String createStagingTableDDL = HiveAvroORCQueryGenerator.generateCreateTableDDL(outputAvroSchema, orcStagingTableName, orcStagingDataLocation, Optional.of(orcTableDatabase), Optional.of(partitionsDDLInfo), clusterBy, Optional.<Map<String, HiveAvroORCQueryGenerator.COLUMN_SORT_ORDER>>absent(), numBuckets, Optional.<String>absent(), Optional.<String>absent(), Optional.<String>absent(), tableProperties, isEvolutionEnabled, destinationTableMeta, hiveColumns);
conversionEntity.getQueries().add(createStagingTableDDL);
log.debug("Create staging table DDL: " + createStagingTableDDL);
// Create DDL statement for partition
String orcStagingDataPartitionDirName = HiveConverterUtils.getStagingDataPartitionDirName(conversionEntity, sourceDataPathIdentifier);
String orcStagingDataPartitionLocation = orcStagingDataLocation + Path.SEPARATOR + orcStagingDataPartitionDirName;
if (partitionsDMLInfo.size() > 0) {
List<String> createStagingPartitionDDL = HiveAvroORCQueryGenerator.generateCreatePartitionDDL(orcTableDatabase, orcStagingTableName, orcStagingDataPartitionLocation, partitionsDMLInfo);
conversionEntity.getQueries().addAll(createStagingPartitionDDL);
log.debug("Create staging partition DDL: " + createStagingPartitionDDL);
}
// Create DML statement
String insertInORCStagingTableDML = HiveAvroORCQueryGenerator.generateTableMappingDML(conversionEntity.getHiveTable().getAvroSchema(), outputAvroSchema, avroTableName, orcStagingTableName, Optional.of(conversionEntity.getTable().getDbName()), Optional.of(orcTableDatabase), Optional.of(partitionsDMLInfo), Optional.<Boolean>absent(), Optional.<Boolean>absent(), isEvolutionEnabled, destinationTableMeta, rowLimit);
conversionEntity.getQueries().add(insertInORCStagingTableDML);
log.debug("Conversion staging DML: " + insertInORCStagingTableDML);
// TODO: Split this method into two (conversion and publish)
// Addition to WUS for Staging publish:
// A. Evolution turned on:
// 1. If table does not exist: simply create it (now it should exist)
// 2. If table exists:
// 2.1 Evolve table (alter table)
// 2.2 If snapshot table:
// 2.2.1 Delete data in final table directory
// 2.2.2 Move data from staging to final table directory
// 2.2.3 Drop this staging table and delete directories
// 2.3 If partitioned table, move partitions from staging to final table; for all partitions:
// 2.3.1 Drop if exists partition in final table
// 2.3.2 Move partition directory
// 2.3.3 Create partition with location
// 2.3.4 Drop this staging table and delete directories
// B. Evolution turned off:
// 1. If table does not exist: simply create it (now it should exist)
// 2. If table exists:
// 2.1 Do not evolve table
// 2.2 If snapshot table:
// 2.2.1 Delete data in final table directory
// 2.2.2 Move data from staging to final table directory
// 2.2.3 Drop this staging table and delete directories
// 2.3 If partitioned table, move partitions from staging to final table; for all partitions:
// 2.3.1 Drop if exists partition in final table
// 2.3.2 Move partition directory
// 2.3.3 Create partition with location
// 2.3.4 Drop this staging table and delete directories
// Note: The queries below also serve as a compatibility check module before conversion; an incompatible
// .. schema throws a Runtime exception, hence preventing further execution
QueryBasedHivePublishEntity publishEntity = new QueryBasedHivePublishEntity();
List<String> publishQueries = publishEntity.getPublishQueries();
Map<String, String> publishDirectories = publishEntity.getPublishDirectories();
List<String> cleanupQueries = publishEntity.getCleanupQueries();
List<String> cleanupDirectories = publishEntity.getCleanupDirectories();
// A.1, B.1: If table does not exist, simply create it
if (!destinationTableMeta.isPresent()) {
String createTargetTableDDL = HiveAvroORCQueryGenerator.generateCreateTableDDL(outputAvroSchema, orcTableName, orcDataLocation, Optional.of(orcTableDatabase), Optional.of(partitionsDDLInfo), clusterBy, Optional.<Map<String, HiveAvroORCQueryGenerator.COLUMN_SORT_ORDER>>absent(), numBuckets, Optional.<String>absent(), Optional.<String>absent(), Optional.<String>absent(), tableProperties, isEvolutionEnabled, destinationTableMeta, new HashMap<String, String>());
publishQueries.add(createTargetTableDDL);
log.debug("Create final table DDL: " + createTargetTableDDL);
}
// Step:
// A.2.1: If table pre-exists (destinationTableMeta would be present), evolve table
// B.2.1: No-op
List<String> evolutionDDLs = HiveAvroORCQueryGenerator.generateEvolutionDDL(orcStagingTableName, orcTableName, Optional.of(orcTableDatabase), Optional.of(orcTableDatabase), outputAvroSchema, isEvolutionEnabled, hiveColumns, destinationTableMeta);
log.debug("Evolve final table DDLs: " + evolutionDDLs);
EventWorkunitUtils.setEvolutionMetadata(workUnit, evolutionDDLs);
// View (if present) must be updated if evolution happens
shouldUpdateView |= evolutionDDLs.size() > 0;
publishQueries.addAll(evolutionDDLs);
if (partitionsDDLInfo.size() == 0) {
// Step:
// A.2.2, B.2.2: Snapshot table
// Step:
// A.2.2.1, B.2.2.1: Delete data in final table directory
// A.2.2.2, B.2.2.2: Move data from staging to final table directory
log.info("Snapshot directory to move: " + orcStagingDataLocation + " to: " + orcDataLocation);
publishDirectories.put(orcStagingDataLocation, orcDataLocation);
// Step:
// A.2.2.3, B.2.2.3: Drop this staging table and delete directories
String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(orcTableDatabase, orcStagingTableName);
log.debug("Drop staging table DDL: " + dropStagingTableDDL);
cleanupQueries.add(dropStagingTableDDL);
// Delete: orcStagingDataLocation
log.info("Staging table directory to delete: " + orcStagingDataLocation);
cleanupDirectories.add(orcStagingDataLocation);
} else {
// Step:
// A.2.3, B.2.3: If partitioned table, move partitions from staging to final table; for all partitions:
// Step:
// A.2.3.2, B.2.3.2: Move partition directory
// Move: orcStagingDataPartitionLocation to: orcFinalDataPartitionLocation
String orcFinalDataPartitionLocation = orcDataLocation + Path.SEPARATOR + orcStagingDataPartitionDirName;
Optional<Path> destPartitionLocation = getDestinationPartitionLocation(destinationTableMeta, workUnit, conversionEntity.getPartition().get().getName());
orcFinalDataPartitionLocation = HiveConverterUtils.updatePartitionLocation(orcFinalDataPartitionLocation, workUnit, destPartitionLocation);
log.info("Partition directory to move: " + orcStagingDataPartitionLocation + " to: " + orcFinalDataPartitionLocation);
publishDirectories.put(orcStagingDataPartitionLocation, orcFinalDataPartitionLocation);
// Step:
// A.2.3.1, B.2.3.1: Drop if exists partition in final table
// Step:
// If destination partition already exists, alter the partition location
// A.2.3.3, B.2.3.3: Create partition with location (and update storage format if not in ORC already)
List<String> dropPartitionsDDL = HiveAvroORCQueryGenerator.generateDropPartitionsDDL(orcTableDatabase, orcTableName, partitionsDMLInfo);
log.debug("Drop partitions if exist in final table: " + dropPartitionsDDL);
publishQueries.addAll(dropPartitionsDDL);
if (workUnit.getPropAsBoolean(HIVE_CONVERSION_SETSERDETOAVROEXPLICITELY, DEFAULT_HIVE_CONVERSION_SETSERDETOAVROEXPLICITELY)) {
List<String> createFinalPartitionDDL = HiveAvroORCQueryGenerator.generateCreatePartitionDDL(orcTableDatabase, orcTableName, orcFinalDataPartitionLocation, partitionsDMLInfo, Optional.<String>absent());
log.debug("Create final partition DDL: " + createFinalPartitionDDL);
publishQueries.addAll(createFinalPartitionDDL);
// Updating the storage format non-transactionally is a stop-gap measure until Hive supports transactionally updating the
// .. storage format in the ADD PARTITION command (today it only supports specifying the location)
List<String> updatePartitionStorageFormatDDL = HiveAvroORCQueryGenerator.generateAlterTableOrPartitionStorageFormatDDL(orcTableDatabase, orcTableName, Optional.of(partitionsDMLInfo), ORC_FORMAT);
log.debug("Update final partition storage format to ORC (if not already in ORC)");
publishQueries.addAll(updatePartitionStorageFormatDDL);
} else {
List<String> createFinalPartitionDDL = HiveAvroORCQueryGenerator.generateCreatePartitionDDL(orcTableDatabase, orcTableName, orcFinalDataPartitionLocation, partitionsDMLInfo, Optional.fromNullable(ORC_FORMAT));
log.debug("Create final partition DDL: " + createFinalPartitionDDL);
publishQueries.addAll(createFinalPartitionDDL);
}
// Step:
// A.2.3.4, B.2.3.4: Drop this staging table and delete directories
String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(orcTableDatabase, orcStagingTableName);
log.debug("Drop staging table DDL: " + dropStagingTableDDL);
cleanupQueries.add(dropStagingTableDDL);
// Delete: orcStagingDataLocation
log.info("Staging table directory to delete: " + orcStagingDataLocation);
cleanupDirectories.add(orcStagingDataLocation);
}
/*
* Drop the replaced partitions, if any. This is required in case the partition being converted is derived from
* several other partitions. E.g., a daily partition is a replacement of the hourly partitions of the same day; when the daily
* partition is converted to ORC, all of its hourly ORC partitions need to be dropped.
*/
publishQueries.addAll(HiveAvroORCQueryGenerator.generateDropPartitionsDDL(orcTableDatabase, orcTableName, getDropPartitionsDDLInfo(conversionEntity)));
/*
* Create or update a view over the ORC table if specified in the config (i.e. the wrapper view name is present in the config)
*/
if (wrapperViewName.isPresent()) {
String viewName = wrapperViewName.get();
List<String> createOrUpdateViewDDLs = HiveAvroORCQueryGenerator.generateCreateOrUpdateViewDDL(orcTableDatabase, orcTableName, orcTableDatabase, viewName, shouldUpdateView);
log.debug("Create or update View DDLs: " + createOrUpdateViewDDLs);
publishQueries.addAll(createOrUpdateViewDDLs);
}
HiveAvroORCQueryGenerator.serializePublishCommands(workUnit, publishEntity);
log.debug("Publish partition entity: " + publishEntity);
log.debug("Conversion Query " + conversionEntity.getQueries());
EventWorkunitUtils.setEndDDLBuildTimeMetadata(workUnit, System.currentTimeMillis());
return new SingleRecordIterable<>(conversionEntity);
}
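Note that convertRecord() does not execute the publish entity it builds: it serializes it into the work unit via HiveAvroORCQueryGenerator.serializePublishCommands(...), and HiveConvertPublisher (shown earlier) recovers it with deserializePublishCommands(...). Below is a hedged sketch of that handoff; only the two method names come from the snippets above, while the class name, property key, and Gson-based encoding are assumptions for illustration.
// Hypothetical codec illustrating the converter -> publisher handoff; the real
// implementation lives in HiveAvroORCQueryGenerator and may differ in key name and format.
public class PublishCommandCodec {
  private static final String PUBLISH_ENTITY_KEY = "hive.conversion.publishEntity"; // assumed key
  private static final Gson GSON = new Gson();

  // Converter side: stash the entity in the work unit state.
  public static void serializePublishCommands(State state, QueryBasedHivePublishEntity publishEntity) {
    state.setProp(PUBLISH_ENTITY_KEY, GSON.toJson(publishEntity));
  }

  // Publisher side: rebuild the entity so publish/cleanup commands can be executed.
  public static QueryBasedHivePublishEntity deserializePublishCommands(State state) {
    return GSON.fromJson(state.getProp(PUBLISH_ENTITY_KEY, "{}"), QueryBasedHivePublishEntity.class);
  }
}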
Use of org.apache.gobblin.data.management.conversion.hive.entities.QueryBasedHivePublishEntity in project incubator-gobblin by apache.
The class QueryBasedMaterializerQueryGenerator, method generatePublishQueries.
@Override
public QueryBasedHivePublishEntity generatePublishQueries() throws DataConversionException {
QueryBasedHivePublishEntity publishEntity = new QueryBasedHivePublishEntity();
List<String> publishQueries = publishEntity.getPublishQueries();
Map<String, String> publishDirectories = publishEntity.getPublishDirectories();
List<String> cleanupQueries = publishEntity.getCleanupQueries();
List<String> cleanupDirectories = publishEntity.getCleanupDirectories();
String createFinalTableDDL = HiveConverterUtils.generateCreateDuplicateTableDDL(outputDatabaseName, stagingTableName, outputTableName, outputDataLocation, Optional.of(outputDatabaseName));
publishQueries.add(createFinalTableDDL);
log.debug("Create final table DDL:\n" + createFinalTableDDL);
log.debug("Snapshot directory to move: " + stagingDataLocation + " to: " + outputDataLocation);
publishDirectories.put(stagingDataLocation, outputDataLocation);
String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(outputDatabaseName, stagingTableName);
log.debug("Drop staging table DDL: " + dropStagingTableDDL);
cleanupQueries.add(dropStagingTableDDL);
log.debug("Staging table directory to delete: " + stagingDataLocation);
cleanupDirectories.add(stagingDataLocation);
publishQueries.addAll(HiveAvroORCQueryGenerator.generateDropPartitionsDDL(outputDatabaseName, outputTableName, new HashMap<>()));
log.info("Publish partition entity: " + publishEntity);
return publishEntity;
}