Search in sources :

Example 1 with PostPublishStep

use of org.apache.gobblin.data.management.copy.entities.PostPublishStep in project incubator-gobblin by apache.

the class HivePartitionFileSet method generateCopyEntities.

/**
 * Builds the list of copy entities for this file set's partition.
 *
 * <p>In order, the method: adds the helper's shared steps, computes the target path and target
 * partition, handles an already-existing target partition, evaluates the fast partition skip
 * predicate, adds a post-publish Hive registration step, diffs source and target locations, adds a
 * pre-publish delete step when there are paths to delete, and finally adds one {@link CopyableFile}
 * per file to copy.
 *
 * @return the generated copy entities; an empty list when the existing target partition is
 *     incompatible and the existing-entity policy is neither {@code REPLACE_PARTITIONS} nor
 *     {@code REPLACE_TABLE_AND_PARTITIONS}, or when the fast partition skip predicate applies
 * @throws IOException declared by this method; the visible body only catches the
 *     {@code IOException} raised by {@code checkPartitionCompatibility}
 */
@Override
protected Collection<CopyEntity> generateCopyEntities() throws IOException {
    try (Closer closer = Closer.create()) {
        // The timer is registered with the closer, so it is closed when this try block exits.
        MultiTimingEvent multiTimer = closer.register(new MultiTimingEvent(this.eventSubmitter, "PartitionCopy", true));
        // Running priority handed to every pre/post publish step created below.
        int stepPriority = 0;
        // The JSON form of the partition values is used as the file set name for all entities.
        String fileSet = HiveCopyEntityHelper.gson.toJson(this.partition.getValues());
        List<CopyEntity> copyEntities = Lists.newArrayList();
        stepPriority = hiveCopyEntityHelper.addSharedSteps(copyEntities, fileSet, stepPriority);
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.COMPUTE_TARGETS);
        Path targetPath = hiveCopyEntityHelper.getTargetLocation(hiveCopyEntityHelper.getDataset().fs, hiveCopyEntityHelper.getTargetFs(), this.partition.getDataLocation(), Optional.of(this.partition));
        Partition targetPartition = getTargetPartition(this.partition, targetPath);
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.EXISTING_PARTITION);
        if (this.existingTargetPartition.isPresent()) {
            // NOTE(review): presumably marks this partition as handled so it is not later treated as
            // a leftover target partition — confirm against the users of getTargetPartitions().
            hiveCopyEntityHelper.getTargetPartitions().remove(this.partition.getValues());
            try {
                checkPartitionCompatibility(targetPartition, this.existingTargetPartition.get());
            } catch (IOException ioe) {
                // Incompatible partitions: abort unless the policy is one of the two REPLACE_* values.
                if (hiveCopyEntityHelper.getExistingEntityPolicy() != HiveCopyEntityHelper.ExistingEntityPolicy.REPLACE_PARTITIONS && hiveCopyEntityHelper.getExistingEntityPolicy() != HiveCopyEntityHelper.ExistingEntityPolicy.REPLACE_TABLE_AND_PARTITIONS) {
                    log.error("Source and target partitions are not compatible. Aborting copy of partition " + this.partition, ioe);
                    return Lists.newArrayList();
                }
                log.warn("Source and target partitions are not compatible. Will override target partition: " + ioe.getMessage());
                log.debug("Incompatibility details: ", ioe);
                // Add deregister steps for the existing target partition, then treat it as absent for
                // the rest of this method.
                stepPriority = hiveCopyEntityHelper.addPartitionDeregisterSteps(copyEntities, fileSet, stepPriority, hiveCopyEntityHelper.getTargetTable(), this.existingTargetPartition.get());
                this.existingTargetPartition = Optional.absent();
            }
        }
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.PARTITION_SKIP_PREDICATE);
        // When a fast-skip predicate is configured and it accepts this file set, nothing is copied.
        if (hiveCopyEntityHelper.getFastPartitionSkip().isPresent() && hiveCopyEntityHelper.getFastPartitionSkip().get().apply(this)) {
            log.info(String.format("Skipping copy of partition %s due to fast partition skip predicate.", this.partition.getCompleteName()));
            return Lists.newArrayList();
        }
        // Post-publish step that registers the target partition using the target table's metadata.
        HiveSpec partitionHiveSpec = new SimpleHiveSpec.Builder<>(targetPath).withTable(HiveMetaStoreUtils.getHiveTable(hiveCopyEntityHelper.getTargetTable().getTTable())).withPartition(Optional.of(HiveMetaStoreUtils.getHivePartition(targetPartition.getTPartition()))).build();
        HiveRegisterStep register = new HiveRegisterStep(hiveCopyEntityHelper.getTargetURI(), partitionHiveSpec, hiveCopyEntityHelper.getHiveRegProps());
        copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), register, stepPriority++));
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_LOCATIONS);
        HiveLocationDescriptor sourceLocation = HiveLocationDescriptor.forPartition(this.partition, hiveCopyEntityHelper.getDataset().fs, this.properties);
        HiveLocationDescriptor desiredTargetLocation = HiveLocationDescriptor.forPartition(targetPartition, hiveCopyEntityHelper.getTargetFs(), this.properties);
        // Absent when there was no existing target partition or it was scheduled for deregistration above.
        Optional<HiveLocationDescriptor> existingTargetLocation = this.existingTargetPartition.isPresent() ? Optional.of(HiveLocationDescriptor.forPartition(this.existingTargetPartition.get(), hiveCopyEntityHelper.getTargetFs(), this.properties)) : Optional.<HiveLocationDescriptor>absent();
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.FULL_PATH_DIFF);
        HiveCopyEntityHelper.DiffPathSet diffPathSet = HiveCopyEntityHelper.fullPathDiff(sourceLocation, desiredTargetLocation, existingTargetLocation, Optional.<Partition>absent(), multiTimer, hiveCopyEntityHelper);
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_DELETE_UNITS);
        // A pre-publish delete step is only added when the diff reported paths to delete.
        if (diffPathSet.pathsToDelete.size() > 0) {
            DeleteFileCommitStep deleteStep = DeleteFileCommitStep.fromPaths(hiveCopyEntityHelper.getTargetFs(), diffPathSet.pathsToDelete, hiveCopyEntityHelper.getDataset().properties);
            copyEntities.add(new PrePublishStep(fileSet, Maps.<String, String>newHashMap(), deleteStep, stepPriority++));
        }
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_COPY_UNITS);
        // One CopyableFile per file in the diff; each is built with an empty checksum and the desired
        // target location as its dataset output path.
        for (CopyableFile.Builder builder : hiveCopyEntityHelper.getCopyableFilesFromPaths(diffPathSet.filesToCopy, hiveCopyEntityHelper.getConfiguration(), Optional.of(this.partition))) {
            CopyableFile fileEntity = builder.fileSet(fileSet).checksum(new byte[0]).datasetOutputPath(desiredTargetLocation.location.toString()).build();
            this.hiveCopyEntityHelper.setCopyableFileDatasets(fileEntity);
            copyEntities.add(fileEntity);
        }
        log.info("Created {} copy entities for partition {}", copyEntities.size(), this.partition.getCompleteName());
        return copyEntities;
    }
}
Also used : Closer(com.google.common.io.Closer) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) MultiTimingEvent(org.apache.gobblin.metrics.event.MultiTimingEvent) IOException(java.io.IOException) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) HiveRegisterStep(org.apache.gobblin.hive.HiveRegisterStep) SimpleHiveSpec(org.apache.gobblin.hive.spec.SimpleHiveSpec) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) PrePublishStep(org.apache.gobblin.data.management.copy.entities.PrePublishStep) HiveSpec(org.apache.gobblin.hive.spec.HiveSpec)

Example 2 with PostPublishStep

use of org.apache.gobblin.data.management.copy.entities.PostPublishStep in project incubator-gobblin by apache.

the class HiveCopyEntityHelper method addPartitionDeregisterSteps.

/**
 * Appends the post-publish steps that remove an existing partition from the target.
 *
 * <p>Depending on the configured delete method, a file-deletion step may be added first; a
 * {@link PartitionDeregisterStep} is always added last. Each added step consumes one priority value.
 *
 * @param copyEntities list the new steps are appended to
 * @param fileSet file set name attached to every created step
 * @param initialPriority priority assigned to the first step that gets added
 * @param table table the partition belongs to
 * @param partition partition to deregister
 * @return the next unused step priority
 * @throws IOException declared by this method; not raised directly in the visible body
 */
int addPartitionDeregisterSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority, Table table, Partition partition) throws IOException {
    // Decide which paths (if any) have to be deleted together with the partition.
    final Collection<Path> pathsToDelete;
    if (DeregisterFileDeleteMethod.RECURSIVE == this.deleteMethod) {
        pathsToDelete = Lists.newArrayList(partition.getDataLocation());
    } else if (DeregisterFileDeleteMethod.INPUT_FORMAT == this.deleteMethod) {
        InputFormat<?, ?> partitionInputFormat = HiveUtils.getInputFormat(partition.getTPartition().getSd());
        HiveLocationDescriptor partitionLocation = new HiveLocationDescriptor(partition.getDataLocation(), partitionInputFormat, this.targetFs, this.dataset.getProperties());
        pathsToDelete = partitionLocation.getPaths().keySet();
    } else {
        // NO_DELETE, or any other value: nothing is deleted.
        pathsToDelete = Lists.newArrayList();
    }

    int nextPriority = initialPriority;
    if (!pathsToDelete.isEmpty()) {
        DeleteFileCommitStep deleteStep = DeleteFileCommitStep.fromPaths(this.targetFs, pathsToDelete, this.dataset.getProperties(), table.getDataLocation());
        copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deleteStep, nextPriority));
        nextPriority++;
    }

    // The deregistration step is always added, after the optional delete step.
    PartitionDeregisterStep deregisterStep = new PartitionDeregisterStep(table.getTTable(), partition.getTPartition(), this.targetURI, this.hiveRegProps);
    copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deregisterStep, nextPriority));
    return nextPriority + 1;
}
Also used : Path(org.apache.hadoop.fs.Path) PartitionDeregisterStep(org.apache.gobblin.hive.PartitionDeregisterStep) InputFormat(org.apache.hadoop.mapred.InputFormat) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) ToString(lombok.ToString)

Example 3 with PostPublishStep

use of org.apache.gobblin.data.management.copy.entities.PostPublishStep in project incubator-gobblin by apache.

the class HiveCopyEntityHelperTest method testAddTableDeregisterSteps.

/**
 * Checks that {@code addTableDeregisterSteps} with the {@code NO_DELETE} delete method adds exactly
 * one {@link PostPublishStep} (the table deregister step) and returns the initial priority plus one.
 *
 * <p>The helper is a Mockito mock with only {@code addTableDeregisterSteps} calling the real
 * implementation, so every getter that method reads is stubbed explicitly below.
 */
@Test
public void testAddTableDeregisterSteps() throws Exception {
    HiveDataset dataset = Mockito.mock(HiveDataset.class);
    Mockito.when(dataset.getProperties()).thenReturn(new Properties());

    HiveCopyEntityHelper helper = Mockito.mock(HiveCopyEntityHelper.class);
    Mockito.when(helper.getDeleteMethod()).thenReturn(DeregisterFileDeleteMethod.NO_DELETE);
    Mockito.when(helper.getTargetURI()).thenReturn(Optional.of("/targetURI"));
    Mockito.when(helper.getHiveRegProps()).thenReturn(new HiveRegProps(new State()));
    Mockito.when(helper.getDataset()).thenReturn(dataset);
    Mockito.when(helper.addTableDeregisterSteps(Mockito.any(List.class), Mockito.any(String.class), Mockito.anyInt(), Mockito.any(org.apache.hadoop.hive.ql.metadata.Table.class))).thenCallRealMethod();

    // Two different Table types are involved: the ql.metadata wrapper and the metastore API object.
    org.apache.hadoop.hive.ql.metadata.Table metaTable = Mockito.mock(org.apache.hadoop.hive.ql.metadata.Table.class);
    org.apache.hadoop.hive.metastore.api.Table apiTable = Mockito.mock(org.apache.hadoop.hive.metastore.api.Table.class);
    Mockito.when(apiTable.getDbName()).thenReturn("TestDB");
    Mockito.when(apiTable.getTableName()).thenReturn("TestTable");
    Mockito.when(metaTable.getTTable()).thenReturn(apiTable);

    List<CopyEntity> copyEntities = new ArrayList<>();
    String fileSet = "testFileSet";
    int initialPriority = 0;
    int priority = helper.addTableDeregisterSteps(copyEntities, fileSet, initialPriority, metaTable);

    // assertEquals (TestNG order: actual, expected) reports the actual value on failure.
    Assert.assertEquals(priority, 1);
    Assert.assertEquals(copyEntities.size(), 1);
    Assert.assertTrue(copyEntities.get(0) instanceof PostPublishStep);
    PostPublishStep p = (PostPublishStep) (copyEntities.get(0));
    String stepDescription = p.getStep().toString();
    Assert.assertTrue(stepDescription.contains("Deregister table TestDB.TestTable on Hive metastore /targetURI"), "Unexpected step description: " + stepDescription);
}
Also used : Table(org.apache.hadoop.hive.ql.metadata.Table) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) ArrayList(java.util.ArrayList) Properties(java.util.Properties) State(org.apache.gobblin.configuration.State) List(java.util.List) HiveRegProps(org.apache.gobblin.hive.HiveRegProps) Test(org.testng.annotations.Test)

Example 4 with PostPublishStep

use of org.apache.gobblin.data.management.copy.entities.PostPublishStep in project incubator-gobblin by apache.

the class ConfigBasedDatasetTest method testGetCopyableFiles.

/**
 * Exercises {@code testGetCopyableFilesHelper} against the {@code configBasedDatasetTest} resources,
 * once with the final flag {@code false} (8 entities expected) and once with it {@code true}
 * (6 entities expected), then checks every entity from the second run by type.
 *
 * <p>Copyable files must map to the same relative path at origin and destination, the pre-publish
 * step must be a delete step mentioning {@code dest/dir1/file1}, and the post-publish step must be a
 * watermark metadata step mentioning {@code dest/_metadata} and the source watermark.
 */
@Test
public void testGetCopyableFiles() throws Exception {
    String sourceDir = getClass().getClassLoader().getResource("configBasedDatasetTest/src").getFile();
    String destinationDir = getClass().getClassLoader().getResource("configBasedDatasetTest/dest").getFile();
    long sourceWatermark = 100L;
    Collection<? extends CopyEntity> copyableFiles = testGetCopyableFilesHelper(sourceDir, destinationDir, sourceWatermark, false);
    Assert.assertEquals(copyableFiles.size(), 8);
    copyableFiles = testGetCopyableFilesHelper(sourceDir, destinationDir, sourceWatermark, true);
    Assert.assertEquals(copyableFiles.size(), 6);
    Set<Path> paths = Sets.newHashSet(new Path("dir1/file2"), new Path("dir1/file1"), new Path("dir2/file1"), new Path("dir2/file3"));
    for (CopyEntity copyEntity : copyableFiles) {
        if (copyEntity instanceof CopyableFile) {
            CopyableFile file = (CopyableFile) copyEntity;
            Path originRelativePath = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(file.getOrigin().getPath()), PathUtils.getPathWithoutSchemeAndAuthority(new Path(sourceDir)));
            Path targetRelativePath = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(file.getDestination()), PathUtils.getPathWithoutSchemeAndAuthority(new Path(destinationDir)));
            Assert.assertTrue(paths.contains(originRelativePath));
            Assert.assertTrue(paths.contains(targetRelativePath));
            Assert.assertEquals(originRelativePath, targetRelativePath);
        } else if (copyEntity instanceof PrePublishStep) {
            PrePublishStep pre = (PrePublishStep) copyEntity;
            Assert.assertTrue(pre.getStep() instanceof DeleteFileCommitStep);
            // need to delete this file
            // contains() instead of indexOf(..) > 0, which would reject a match at index 0.
            Assert.assertTrue(pre.explain().contains("configBasedDatasetTest/dest/dir1/file1"));
        } else if (copyEntity instanceof PostPublishStep) {
            PostPublishStep post = (PostPublishStep) copyEntity;
            Assert.assertTrue(post.getStep() instanceof WatermarkMetadataGenerationCommitStep);
            // Checked separately so a failure shows which fragment is missing.
            String explanation = post.explain();
            Assert.assertTrue(explanation.contains("dest/_metadata"));
            Assert.assertTrue(explanation.contains(String.valueOf(sourceWatermark)));
        } else {
            throw new Exception("Wrong type");
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) PrePublishStep(org.apache.gobblin.data.management.copy.entities.PrePublishStep) Test(org.testng.annotations.Test)

Example 5 with PostPublishStep

use of org.apache.gobblin.data.management.copy.entities.PostPublishStep in project incubator-gobblin by apache.

the class HiveCopyEntityHelper method addTableDeregisterSteps.

/**
 * Appends the post-publish steps that remove an existing table from the target.
 *
 * <p>Depending on the delete method returned by {@code getDeleteMethod()}, a file-deletion step may
 * be added first; a {@link TableDeregisterStep} is always added last. Each added step consumes one
 * priority value. All collaborators are read through getters.
 *
 * @param copyEntities list the new steps are appended to
 * @param fileSet file set name attached to every created step
 * @param initialPriority priority assigned to the first step that gets added
 * @param table table to deregister
 * @return the next unused step priority
 * @throws IOException declared by this method; not raised directly in the visible body
 */
@VisibleForTesting
protected int addTableDeregisterSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority, Table table) throws IOException {
    // Decide which paths (if any) have to be deleted together with the table.
    final Collection<Path> pathsToDelete;
    switch (this.getDeleteMethod()) {
        case RECURSIVE:
            pathsToDelete = Lists.newArrayList(table.getDataLocation());
            break;
        case INPUT_FORMAT: {
            InputFormat<?, ?> tableInputFormat = HiveUtils.getInputFormat(table.getSd());
            HiveLocationDescriptor tableLocation = new HiveLocationDescriptor(table.getDataLocation(), tableInputFormat, this.getTargetFs(), this.getDataset().getProperties());
            pathsToDelete = tableLocation.getPaths().keySet();
            break;
        }
        case NO_DELETE:
        default:
            // Nothing is deleted for NO_DELETE or any other delete method.
            pathsToDelete = Lists.newArrayList();
            break;
    }

    int nextPriority = initialPriority;
    if (!pathsToDelete.isEmpty()) {
        DeleteFileCommitStep deleteStep = DeleteFileCommitStep.fromPaths(this.getTargetFs(), pathsToDelete, this.getDataset().getProperties(), table.getDataLocation());
        copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deleteStep, nextPriority));
        nextPriority++;
    }

    // The deregistration step is always added, after the optional delete step.
    TableDeregisterStep deregisterStep = new TableDeregisterStep(table.getTTable(), this.getTargetURI(), this.getHiveRegProps());
    copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deregisterStep, nextPriority));
    return nextPriority + 1;
}
Also used : Path(org.apache.hadoop.fs.Path) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) ToString(lombok.ToString) TableDeregisterStep(org.apache.gobblin.hive.TableDeregisterStep) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep)6 DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep)5 Path (org.apache.hadoop.fs.Path)5 CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity)4 CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile)3 PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep)3 ToString (lombok.ToString)2 Test (org.testng.annotations.Test)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Closer (com.google.common.io.Closer)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 Properties (java.util.Properties)1 State (org.apache.gobblin.configuration.State)1 CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration)1 HiveRegProps (org.apache.gobblin.hive.HiveRegProps)1 HiveRegisterStep (org.apache.gobblin.hive.HiveRegisterStep)1 PartitionDeregisterStep (org.apache.gobblin.hive.PartitionDeregisterStep)1 TableDeregisterStep (org.apache.gobblin.hive.TableDeregisterStep)1