Use of org.apache.hadoop.hive.ql.ddl.table.partition.add.AlterTableAddPartitionDesc in project hive by apache.
The class FSTableEvent, method partitionDescriptions.
@Override
public List<AlterTableAddPartitionDesc> partitionDescriptions(ImportTableDesc tblDesc) throws SemanticException {
  List<AlterTableAddPartitionDesc> descs = new ArrayList<>();
  // TODO: if partitions are loaded lazily via the iterator, then we will have to avoid converting everything
  // here, as that defeats the purpose.
  for (Partition partition : metadata.getPartitions()) {
    // TODO: this should ideally not create an AddPartitionDesc per partition
    AlterTableAddPartitionDesc partsDesc = addPartitionDesc(fromPathMetadata, tblDesc, partition);
    descs.add(partsDesc);
  }
  return descs;
}
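As a quick illustration of how a caller might consume this result, here is a minimal, hypothetical helper (not part of Hive) that collects the partition spec of each single-partition desc. It relies only on the getPartitions() and getPartSpec() accessors already used in these snippets.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.ql.ddl.table.partition.add.AlterTableAddPartitionDesc;

public final class PartitionSpecCollector {
  private PartitionSpecCollector() {
  }

  /** Collects the partition spec of every single-partition desc, preserving order. */
  public static List<Map<String, String>> collectPartSpecs(List<AlterTableAddPartitionDesc> descs) {
    List<Map<String, String>> specs = new ArrayList<>(descs.size());
    for (AlterTableAddPartitionDesc desc : descs) {
      // Each desc produced by partitionDescriptions above carries exactly one partition.
      specs.add(desc.getPartitions().get(0).getPartSpec());
    }
    return specs;
  }
}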
Use of org.apache.hadoop.hive.ql.ddl.table.partition.add.AlterTableAddPartitionDesc in project hive by apache.
The class LoadPartitions, method forExistingTable.
private TaskTracker forExistingTable(AlterTableAddPartitionDesc lastPartitionReplicated) throws Exception {
  boolean encounteredTheLastReplicatedPartition = (lastPartitionReplicated == null);
  Map<String, String> lastReplicatedPartSpec = null;
  if (!encounteredTheLastReplicatedPartition) {
    lastReplicatedPartSpec = lastPartitionReplicated.getPartitions().get(0).getPartSpec();
    LOG.info("Start processing from partition info spec : {}", StringUtils.mapToString(lastReplicatedPartSpec));
  }
  Iterator<AlterTableAddPartitionDesc> partitionIterator = event.partitionDescriptions(tableDesc).iterator();
  while (!encounteredTheLastReplicatedPartition && partitionIterator.hasNext()) {
    AlterTableAddPartitionDesc addPartitionDesc = partitionIterator.next();
    Map<String, String> currentSpec = addPartitionDesc.getPartitions().get(0).getPartSpec();
    encounteredTheLastReplicatedPartition = lastReplicatedPartSpec.equals(currentSpec);
  }
  // Add the copy task still pending for the previous partition.
  if (PartitionState.Stage.COPY.equals(lastReplicatedStage)) {
    addTasksForPartition(table, lastPartitionReplicated, lastReplicatedPartitionDesc);
  }
  boolean pendingPartitions = false;
  while (partitionIterator.hasNext() && tracker.canAddMoreTasks()) {
    pendingPartitions = true;
    AlterTableAddPartitionDesc addPartitionDesc = partitionIterator.next();
    AlterTableAddPartitionDesc.PartitionDesc src = addPartitionDesc.getPartitions().get(0);
    // Add the checkpoint key as part of the add-partition request.
    Map<String, String> partParams = new HashMap<>();
    partParams.put(REPL_CHECKPOINT_KEY, context.dumpDirectory);
    Path replicaWarehousePartitionLocation = locationOnReplicaWarehouse(table, src);
    src.setLocation(replicaWarehousePartitionLocation.toString());
    src.addPartParams(partParams);
    Map<String, String> partSpec = src.getPartSpec();
    ReplLoadOpType loadPtnType = getLoadPartitionType(partSpec);
    switch (loadPtnType) {
      case LOAD_NEW:
        break;
      case LOAD_REPLACE:
        tracker.addDependentTask(dropPartitionTask(table, partSpec));
        break;
      case LOAD_SKIP:
        continue;
      default:
        break;
    }
  }
  if (pendingPartitions) {
    addConsolidatedPartitionDesc(lastPartitionReplicated);
  }
  return tracker;
}
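A side note on the resume check above: it relies on plain java.util.Map equality of the partition specs, which is order-independent. A small standalone sketch with hypothetical values:

import java.util.HashMap;
import java.util.Map;

public class PartSpecEqualityDemo {
  public static void main(String[] args) {
    Map<String, String> lastReplicated = new HashMap<>();
    lastReplicated.put("ds", "2021-01-01");
    lastReplicated.put("hr", "10");

    Map<String, String> current = new HashMap<>();
    current.put("hr", "10");
    current.put("ds", "2021-01-01");

    // Map.equals compares key/value pairs regardless of insertion order, which is
    // how forExistingTable detects that the last replicated partition was reached.
    System.out.println(lastReplicated.equals(current)); // prints: true
  }
}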
Use of org.apache.hadoop.hive.ql.ddl.table.partition.add.AlterTableAddPartitionDesc in project hive by apache.
The class LoadPartitions, method addConsolidatedPartitionDesc.
/**
 * Get all partitions in a batch and consolidate them into a single partition request.
 * Also, copy relevant stats and other information from the original request.
 *
 * @param lastAlterTableAddPartitionDesc
 * @throws SemanticException
 */
private void addConsolidatedPartitionDesc(AlterTableAddPartitionDesc lastAlterTableAddPartitionDesc) throws Exception {
  int maxTasks = 0;
  // Load a batch-size worth of partitions in one go for metadata-only replication and for external tables.
  if (isMetaDataOp() || TableType.EXTERNAL_TABLE.equals(table.getTableType())) {
    maxTasks = context.hiveConf.getIntVar(HiveConf.ConfVars.REPL_LOAD_PARTITIONS_BATCH_SIZE);
  } else {
    maxTasks = context.hiveConf.getIntVar(HiveConf.ConfVars.REPL_LOAD_PARTITIONS_WITH_DATA_COPY_BATCH_SIZE);
  }
  int currentPartitionCount = 0;
  Iterator<AlterTableAddPartitionDesc> partitionIterator = event.partitionDescriptions(tableDesc).iterator();
  // If a set of partitions was already processed as part of a previous run, skip those.
  if (lastAlterTableAddPartitionDesc != null) {
    while (partitionIterator.hasNext()) {
      currentPartitionCount++;
      AlterTableAddPartitionDesc addPartitionDesc = partitionIterator.next();
      if (lastAlterTableAddPartitionDesc.getPartitions().get(0).getPartSpec()
          .equals(addPartitionDesc.getPartitions().get(0).getPartSpec())) {
        break;
      }
    }
  }
  List<AlterTableAddPartitionDesc> partitionDescs = event.partitionDescriptions(tableDesc);
  int totalPartitionCount = partitionDescs.size();
  while (currentPartitionCount < totalPartitionCount) {
    List<AlterTableAddPartitionDesc.PartitionDesc> partitions = new LinkedList<>();
    int pendingPartitionCount = totalPartitionCount - currentPartitionCount;
    int toPartitionCount = currentPartitionCount + Math.min(pendingPartitionCount, maxTasks);
    List<AlterTableAddPartitionDesc> partitionBatch = partitionDescs.subList(currentPartitionCount, toPartitionCount);
    for (AlterTableAddPartitionDesc addPartitionDesc : partitionBatch) {
      AlterTableAddPartitionDesc.PartitionDesc src = addPartitionDesc.getPartitions().get(0);
      Map<String, String> partParams = src.getPartParams();
      if (partParams == null) {
        partParams = new HashMap<>();
      }
      partParams.put(REPL_CHECKPOINT_KEY, context.dumpDirectory);
      Path replicaWarehousePartitionLocation = locationOnReplicaWarehouse(table, src);
      partitions.add(new AlterTableAddPartitionDesc.PartitionDesc(src.getPartSpec(),
          replicaWarehousePartitionLocation.toString(), partParams, src.getInputFormat(), src.getOutputFormat(),
          src.getNumBuckets(), src.getCols(), src.getSerializationLib(), src.getSerdeParams(), src.getBucketCols(),
          src.getSortCols(), src.getColStats(), src.getWriteId()));
    }
    AlterTableAddPartitionDesc consolidatedPartitionDesc =
        new AlterTableAddPartitionDesc(tableDesc.getDatabaseName(), tableDesc.getTableName(), true, partitions);
    // No need to add a checkpoint task separately; it is added as part of the add-partition task.
    addPartition((toPartitionCount < totalPartitionCount), consolidatedPartitionDesc);
    if (!tracker.canAddMoreTasks()) {
      // No more tasks can be added in this run; the remaining partitions are picked up in the next run.
      // The replication state is already updated in the add-partition task.
      return;
    }
    currentPartitionCount = toPartitionCount;
  }
}
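To isolate the consolidation idea, here is a simplified, hypothetical sketch (not Hive code) that merges a batch of single-partition descs into one multi-partition desc via the same AlterTableAddPartitionDesc constructor used above. Unlike the method above, it reuses the original PartitionDesc objects instead of rewriting their locations, parameters, and stats.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.ddl.table.partition.add.AlterTableAddPartitionDesc;

public final class ConsolidateDescsSketch {
  private ConsolidateDescsSketch() {
  }

  /** Merges a batch of single-partition descs into one multi-partition desc. */
  public static AlterTableAddPartitionDesc consolidate(String dbName, String tableName,
      List<AlterTableAddPartitionDesc> batch) {
    List<AlterTableAddPartitionDesc.PartitionDesc> partitions = new ArrayList<>(batch.size());
    for (AlterTableAddPartitionDesc desc : batch) {
      // Each incoming desc is assumed to carry exactly one partition, as in the snippets above.
      partitions.add(desc.getPartitions().get(0));
    }
    // The boolean flag mirrors the 'true' passed by addConsolidatedPartitionDesc above
    // (if-not-exists semantics assumed).
    return new AlterTableAddPartitionDesc(dbName, tableName, true, partitions);
  }
}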
Use of org.apache.hadoop.hive.ql.ddl.table.partition.add.AlterTableAddPartitionDesc in project hive by apache.
The class ImportSemanticAnalyzer, method addSinglePartition.
private static Task<?> addSinglePartition(ImportTableDesc tblDesc, Table table, Warehouse wh,
    AlterTableAddPartitionDesc addPartitionDesc, ReplicationSpec replicationSpec,
    EximUtil.SemanticAnalyzerWrapperContext x, Long writeId, int stmtId, boolean isReplication, String dumpRoot,
    ReplicationMetricCollector metricCollector) throws MetaException, IOException, HiveException {
  AlterTableAddPartitionDesc.PartitionDesc partSpec = addPartitionDesc.getPartitions().get(0);
  boolean isSkipTrash = false;
  boolean needRecycle = false;
  if (shouldSkipDataCopyInReplScope(tblDesc, replicationSpec)
      || (tblDesc.isExternal() && tblDesc.getLocation() == null)) {
    x.getLOG().debug("Adding AddPart and skipped data copy for partition "
        + partSpecToString(partSpec.getPartSpec()));
    // addPartitionDesc already has the right partition location
    @SuppressWarnings("unchecked")
    Task<?> addPartTask = TaskFactory.get(
        new DDLWork(x.getInputs(), x.getOutputs(), addPartitionDesc, isReplication, dumpRoot, metricCollector),
        x.getConf());
    return addPartTask;
  } else {
    String srcLocation = partSpec.getLocation();
    if (replicationSpec.isInReplicationScope()
        && !ReplicationSpec.Type.IMPORT.equals(replicationSpec.getReplSpecType())) {
      Path partLocation = new Path(partSpec.getLocation());
      Path dataDirBase = partLocation.getParent();
      String bucketDir = partLocation.getName();
      for (int i = 1; i < partSpec.getPartSpec().size(); i++) {
        bucketDir = dataDirBase.getName() + File.separator + bucketDir;
        dataDirBase = dataDirBase.getParent();
      }
      String relativePartDataPath = EximUtil.DATA_PATH_NAME + File.separator + bucketDir;
      srcLocation = new Path(dataDirBase, relativePartDataPath).toString();
    }
    fixLocationInPartSpec(tblDesc, table, wh, replicationSpec, partSpec, x);
    x.getLOG().debug("adding dependent CopyWork/AddPart/MoveWork for partition "
        + partSpecToString(partSpec.getPartSpec()) + " with source location: " + srcLocation);
    Path tgtLocation = new Path(partSpec.getLocation());
    LoadFileType loadFileType;
    Path destPath;
    if (replicationSpec.isInReplicationScope()) {
      loadFileType = LoadFileType.IGNORE;
      destPath = tgtLocation;
      isSkipTrash = MetaStoreUtils.isSkipTrash(table.getParameters());
      if (table.isTemporary()) {
        needRecycle = false;
      } else {
        org.apache.hadoop.hive.metastore.api.Database db = x.getHive().getDatabase(table.getDbName());
        needRecycle = db != null && ReplChangeManager.shouldEnableCm(db, table.getTTable());
      }
    } else {
      loadFileType = replicationSpec.isReplace() ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING;
      // In replication scope the write id will be invalid.
      boolean useStagingDirectory = !AcidUtils.isTransactionalTable(table.getParameters())
          || replicationSpec.isInReplicationScope();
      destPath = useStagingDirectory ? x.getCtx().getExternalTmpPath(tgtLocation)
          : new Path(tgtLocation, AcidUtils.deltaSubdir(writeId, writeId, stmtId));
    }
    Path moveTaskSrc = !AcidUtils.isTransactionalTable(table.getParameters())
        || replicationSpec.isInReplicationScope() ? destPath : tgtLocation;
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
      Utilities.FILE_OP_LOGGER.trace("adding import work for partition with source location: " + srcLocation
          + "; target: " + tgtLocation + "; copy dest " + destPath + "; mm " + writeId + " for "
          + partSpecToString(partSpec.getPartSpec()) + ": "
          + (AcidUtils.isFullAcidTable(table) ? "acid" : (AcidUtils.isInsertOnlyTable(table) ? "mm" : "flat")));
    }
    Task<?> copyTask = null;
    if (replicationSpec.isInReplicationScope()) {
      boolean copyAtLoad = x.getConf().getBoolVar(HiveConf.ConfVars.REPL_RUN_DATA_COPY_TASKS_ON_TARGET);
      copyTask = ReplCopyTask.getLoadCopyTask(replicationSpec, new Path(srcLocation), destPath, x.getConf(),
          isSkipTrash, needRecycle, copyAtLoad, dumpRoot, metricCollector);
    } else {
      copyTask = TaskFactory.get(new CopyWork(new Path(srcLocation), destPath, false, dumpRoot, metricCollector,
          isReplication));
    }
    Task<?> addPartTask = null;
    if (x.getEventType() != DumpType.EVENT_COMMIT_TXN) {
      // During replication, by the time we are applying the commit transaction event, we expect
      // the partition/s to be already added or altered by previous events. So no need to
      // create an add partition event again.
      addPartTask = TaskFactory.get(
          new DDLWork(x.getInputs(), x.getOutputs(), addPartitionDesc, isReplication, dumpRoot, metricCollector),
          x.getConf());
    }
    MoveWork moveWork = new MoveWork(x.getInputs(), x.getOutputs(), null, null, false, dumpRoot, metricCollector,
        isReplication);
    // See setLoadFileType and setIsAcidIow calls elsewhere for an example.
    if (replicationSpec.isInReplicationScope() && AcidUtils.isTransactionalTable(tblDesc.getTblProps())) {
      LoadMultiFilesDesc loadFilesWork = new LoadMultiFilesDesc(Collections.singletonList(destPath),
          Collections.singletonList(tgtLocation), true, null, null);
      moveWork.setMultiFilesDesc(loadFilesWork);
      moveWork.setNeedCleanTarget(replicationSpec.isReplace());
    } else {
      LoadTableDesc loadTableWork = new LoadTableDesc(moveTaskSrc, Utilities.getTableDesc(table),
          partSpec.getPartSpec(), loadFileType, writeId);
      loadTableWork.setStmtId(stmtId);
      loadTableWork.setInheritTableSpecs(false);
      moveWork.setLoadTableWork(loadTableWork);
    }
    if (loadFileType == LoadFileType.IGNORE) {
      // For an insert event the move task is still needed for the metadata/stats
      // update, which is again done in load operations as part of the move task.
      if (x.getEventType() == DumpType.EVENT_INSERT) {
        copyTask.addDependentTask(TaskFactory.get(moveWork, x.getConf()));
      } else {
        if (addPartTask != null) {
          copyTask.addDependentTask(addPartTask);
        }
      }
      return copyTask;
    }
    Task<?> loadPartTask = TaskFactory.get(moveWork, x.getConf());
    copyTask.addDependentTask(loadPartTask);
    if (addPartTask != null) {
      addPartTask.addDependentTask(loadPartTask);
      x.getTasks().add(copyTask);
      return addPartTask;
    }
    return copyTask;
  }
}
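The srcLocation rewrite in the replication branch above is easy to miss: it walks one parent directory per additional partition column and re-roots the partition directories under EximUtil.DATA_PATH_NAME. Below is a standalone sketch with hypothetical paths; partSpecSize stands in for partSpec.getPartSpec().size(), and "data" is assumed to be what EximUtil.DATA_PATH_NAME resolves to.

import java.io.File;
import org.apache.hadoop.fs.Path;

public class PartitionDataPathSketch {
  public static void main(String[] args) {
    // Hypothetical dump location of a partition with a two-column spec {ds=..., hr=...}.
    Path partLocation = new Path("/repl/dump/db/tbl/ds=2021-01-01/hr=10");
    int partSpecSize = 2;
    Path dataDirBase = partLocation.getParent();
    String bucketDir = partLocation.getName();
    for (int i = 1; i < partSpecSize; i++) {
      // Move one partition-column directory from the base path into the relative part.
      bucketDir = dataDirBase.getName() + File.separator + bucketDir;
      dataDirBase = dataDirBase.getParent();
    }
    // "data" stands in for EximUtil.DATA_PATH_NAME here (an assumption for this sketch).
    String relativePartDataPath = "data" + File.separator + bucketDir;
    // Prints /repl/dump/db/tbl/data/ds=2021-01-01/hr=10
    System.out.println(new Path(dataDirBase, relativePartDataPath));
  }
}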
Use of org.apache.hadoop.hive.ql.ddl.table.partition.add.AlterTableAddPartitionDesc in project hive by apache.
The class ImportSemanticAnalyzer, method createReplImportTasks.
/**
 * Create tasks for repl import.
 */
private static void createReplImportTasks(ImportTableDesc tblDesc, List<AlterTableAddPartitionDesc> partitionDescs,
    ReplicationSpec replicationSpec, boolean waitOnPrecursor, Table table, URI fromURI, Warehouse wh,
    EximUtil.SemanticAnalyzerWrapperContext x, Long writeId, int stmtId, UpdatedMetaDataTracker updatedMetadata,
    String dumpRoot, ReplicationMetricCollector metricCollector) throws HiveException, IOException, MetaException {
  Task<?> dropTblTask = null;
  WriteEntity.WriteType lockType = WriteEntity.WriteType.DDL_NO_LOCK;
  boolean firstIncPending;
  // Normally, on import, trying to create a table or a partition in a db that does not yet exist
  // is an error condition. However, in the case of a REPL LOAD, it is possible that we are trying
  // to create tasks to create a table inside a db that as-of-now does not exist, but there is
  // a precursor Task waiting that will create it before this is encountered. Thus, we instantiate
  // defaults and do not error out in that case.
  Database parentDb = x.getHive().getDatabase(tblDesc.getDatabaseName());
  if (parentDb == null) {
    if (!waitOnPrecursor) {
      throw new SemanticException(ErrorMsg.DATABASE_NOT_EXISTS.getMsg(tblDesc.getDatabaseName()));
    }
    // For warehouse level replication, if the database itself is getting created in this load, then no need to
    // check for duplicate copy. Check HIVE-21197 for more detail.
    firstIncPending = false;
  } else {
    // For database replication, get the flag from the database parameter. Check HIVE-21197 for more detail.
    firstIncPending = ReplUtils.isFirstIncPending(parentDb.getParameters());
  }
  if (table != null) {
    if (!replicationSpec.allowReplacementInto(parentDb.getParameters())) {
      // If the target table exists and is newer or same as current update based on repl.last.id, then just noop it.
      x.getLOG().info("Table {}.{} is not replaced as it is newer than the update",
          tblDesc.getDatabaseName(), tblDesc.getTableName());
      return;
    }
    // The table exists while a create table event is being replayed, which means the previous attempt
    // didn't set the last repl ID due to some failure. Drop and re-create the table.
    if (x.getEventType() == DumpType.EVENT_CREATE_TABLE) {
      dropTblTask = dropTableTask(table, x, replicationSpec, dumpRoot, metricCollector);
      table = null;
    } else if (!firstIncPending) {
      // If the db-level pending flag is not set, then check the table parameter for a table level load.
      // Check HIVE-21197 for more detail.
      firstIncPending = ReplUtils.isFirstIncPending(table.getParameters());
    }
  } else {
    // If the table doesn't exist, allow creating a new one only if the database state is older than the update.
    if ((parentDb != null) && (!replicationSpec.allowReplacementInto(parentDb.getParameters()))) {
      // If the target table exists and is newer or same as current update based on repl.last.id, then just noop it.
      x.getLOG().info("Table {}.{} is not created as the database is newer than the update",
          tblDesc.getDatabaseName(), tblDesc.getTableName());
      return;
    }
  }
  // For the first incremental load just after bootstrap, we need to check for duplicate copy.
  // Check HIVE-21197 for more detail.
  replicationSpec.setNeedDupCopyCheck(firstIncPending);
  if (updatedMetadata != null) {
    updatedMetadata.set(replicationSpec.getReplicationState(), tblDesc.getDatabaseName(), tblDesc.getTableName(),
        null);
  }
  if (tblDesc.getLocation() == null) {
    if (parentDb != null && !tblDesc.isExternal()
        && org.apache.commons.lang.StringUtils.isNotBlank(parentDb.getManagedLocationUri())) {
      tblDesc.setLocation(new Path(parentDb.getManagedLocationUri(), tblDesc.getTableName()).toString());
      LOG.info("Setting the location for table {} as {}", tblDesc.getTableName(), tblDesc.getLocation());
    } else if (!waitOnPrecursor) {
      tblDesc.setLocation(wh.getDefaultTablePath(parentDb, tblDesc.getTableName(), tblDesc.isExternal()).toString());
    } else {
      tblDesc.setLocation(wh.getDnsPath(
          wh.getDefaultTablePath(tblDesc.getDatabaseName(), tblDesc.getTableName(), tblDesc.isExternal())).toString());
    }
  }
  /* Note: In the following section, metadata-only import handling logic is
     interleaved with regular repl-import logic. The rule of thumb being
     followed here is that MD-only imports are essentially ALTERs. They do
     not load data, and should not be "creating" any metadata - they should
     be replacing instead. The only place it makes sense for a MD-only import
     to create is in the case of a table that's been dropped and recreated,
     or in the case of an unpartitioned table. In all other cases, it should
     behave like a noop or a pure MD alter.
  */
  if (table == null) {
    if (lockType == WriteEntity.WriteType.DDL_NO_LOCK) {
      lockType = WriteEntity.WriteType.DDL_SHARED;
    }
    table = createNewTableMetadataObject(tblDesc, true);
    List<Task<?>> dependentTasks = null;
    if (isPartitioned(tblDesc)) {
      dependentTasks = new ArrayList<>(partitionDescs.size());
      for (AlterTableAddPartitionDesc addPartitionDesc : partitionDescs) {
        addPartitionDesc.setReplicationSpec(replicationSpec);
        if (!replicationSpec.isMetadataOnly()) {
          dependentTasks.add(addSinglePartition(tblDesc, table, wh, addPartitionDesc, replicationSpec, x,
              writeId, stmtId, true, dumpRoot, metricCollector));
        } else {
          dependentTasks.add(alterSinglePartition(tblDesc, table, wh, addPartitionDesc, replicationSpec, null, x,
              true, dumpRoot, metricCollector));
        }
        if (updatedMetadata != null) {
          updatedMetadata.addPartition(table.getDbName(), table.getTableName(),
              addPartitionDesc.getPartitions().get(0).getPartSpec());
        }
      }
    } else if (!replicationSpec.isMetadataOnly() && !shouldSkipDataCopyInReplScope(tblDesc, replicationSpec)) {
      x.getLOG().debug("adding dependent CopyWork/MoveWork for table");
      dependentTasks = Collections.singletonList(loadTable(fromURI, table, replicationSpec.isReplace(),
          new Path(tblDesc.getLocation()), replicationSpec, x, writeId, stmtId, dumpRoot, metricCollector));
    }
    // For a commit transaction event, the table is expected to have been created by earlier events,
    // so no need to create it again.
    if (x.getEventType() != DumpType.EVENT_COMMIT_TXN) {
      // Don't set location for managed tables while creating the table.
      if (x.getEventType() == DumpType.EVENT_CREATE_TABLE && !tblDesc.isExternal()) {
        tblDesc.setLocation(null);
      }
      Task t = createTableTask(tblDesc, x, dumpRoot, metricCollector);
      if (dependentTasks != null) {
        dependentTasks.forEach(task -> t.addDependentTask(task));
      }
      if (dropTblTask != null) {
        // Drop first and then create
        dropTblTask.addDependentTask(t);
        x.getTasks().add(dropTblTask);
      } else {
        // Simply create
        x.getTasks().add(t);
      }
    } else {
      // No drop-table task is expected while replaying a commit transaction
      // event. That should have been done when replaying the create table event itself.
      assert dropTblTask == null;
      // Add all the tasks created above directly
      if (dependentTasks != null) {
        x.getTasks().addAll(dependentTasks);
      }
    }
  } else {
    // If the table of the current event has a partition flag different from the existing table, it means some
    // of the previous events in the same batch have drop and create table events with the same name but a
    // different partition flag. In this case, we should go with the current event's table type and so
    // create a dummy table object for adding repl tasks.
    boolean isOldTableValid = true;
    if (table.isPartitioned() != isPartitioned(tblDesc)) {
      table = createNewTableMetadataObject(tblDesc, true);
      isOldTableValid = false;
    }
    // Table existed, and is okay to replicate into, not dropping and re-creating.
    if (isPartitioned(tblDesc)) {
      x.getLOG().debug("table partitioned");
      for (AlterTableAddPartitionDesc addPartitionDesc : partitionDescs) {
        addPartitionDesc.setReplicationSpec(replicationSpec);
        Map<String, String> partSpec = addPartitionDesc.getPartitions().get(0).getPartSpec();
        org.apache.hadoop.hive.ql.metadata.Partition ptn = null;
        if (isOldTableValid) {
          // If the old table object is still valid, look up the partition; on failure, fall back to a fresh
          // table object, skip the validation and create a new partition.
          try {
            ptn = x.getHive().getPartition(table, partSpec, false);
          } catch (HiveException ex) {
            ptn = null;
            table = createNewTableMetadataObject(tblDesc, true);
            isOldTableValid = false;
          }
        }
        if (ptn == null) {
          if (!replicationSpec.isMetadataOnly()) {
            x.getTasks().add(addSinglePartition(tblDesc, table, wh, addPartitionDesc, replicationSpec, x,
                writeId, stmtId, true, dumpRoot, metricCollector));
            if (updatedMetadata != null) {
              updatedMetadata.addPartition(table.getDbName(), table.getTableName(), partSpec);
            }
          } else {
            x.getTasks().add(alterSinglePartition(tblDesc, table, wh, addPartitionDesc, replicationSpec, null, x,
                true, dumpRoot, metricCollector));
            if (updatedMetadata != null) {
              updatedMetadata.addPartition(table.getDbName(), table.getTableName(), partSpec);
            }
          }
        } else {
          // The partition already exists, so replace it only if
          // the destination ptn's repl.last.id is older than the replacement's.
          if (replicationSpec.allowReplacementInto(parentDb.getParameters())) {
            if (!replicationSpec.isMetadataOnly()) {
              x.getTasks().add(addSinglePartition(tblDesc, table, wh, addPartitionDesc, replicationSpec, x,
                  writeId, stmtId, true, dumpRoot, metricCollector));
            } else {
              x.getTasks().add(alterSinglePartition(tblDesc, table, wh, addPartitionDesc, replicationSpec, ptn, x,
                  true, dumpRoot, metricCollector));
            }
            if (updatedMetadata != null) {
              updatedMetadata.addPartition(table.getDbName(), table.getTableName(), partSpec);
            }
            if (lockType == WriteEntity.WriteType.DDL_NO_LOCK) {
              lockType = WriteEntity.WriteType.DDL_SHARED;
            }
          }
        }
      }
      if (replicationSpec.isMetadataOnly() && partitionDescs.isEmpty()) {
        // MD-ONLY table alter
        x.getTasks().add(alterTableTask(tblDesc, x, replicationSpec, true, dumpRoot, metricCollector));
        if (lockType == WriteEntity.WriteType.DDL_NO_LOCK) {
          lockType = WriteEntity.WriteType.DDL_SHARED;
        }
      }
    } else {
      x.getLOG().debug("table non-partitioned");
      if (!replicationSpec.isMetadataOnly()) {
        // repl-imports are replace-into unless the event is insert-into
        loadTable(fromURI, table, replicationSpec.isReplace(), new Path(tblDesc.getLocation()), replicationSpec, x,
            writeId, stmtId, dumpRoot, metricCollector);
      } else {
        x.getTasks().add(alterTableTask(tblDesc, x, replicationSpec, true, dumpRoot, metricCollector));
      }
      if (lockType == WriteEntity.WriteType.DDL_NO_LOCK) {
        lockType = WriteEntity.WriteType.DDL_SHARED;
      }
    }
  }
  x.getOutputs().add(new WriteEntity(table, lockType));
}