
Example 6 with CopyEntity

Use of org.apache.gobblin.data.management.copy.CopyEntity in project incubator-gobblin by apache.

From class HiveCopyEntityHelperTest, method testAddTableDeregisterSteps.

@Test
public void testAddTableDeregisterSteps() throws Exception {
    HiveDataset dataset = Mockito.mock(HiveDataset.class);
    Mockito.when(dataset.getProperties()).thenReturn(new Properties());
    HiveCopyEntityHelper helper = Mockito.mock(HiveCopyEntityHelper.class);
    Mockito.when(helper.getDeleteMethod()).thenReturn(DeregisterFileDeleteMethod.NO_DELETE);
    Mockito.when(helper.getTargetURI()).thenReturn(Optional.of("/targetURI"));
    Mockito.when(helper.getHiveRegProps()).thenReturn(new HiveRegProps(new State()));
    Mockito.when(helper.getDataset()).thenReturn(dataset);
    Mockito.when(helper.addTableDeregisterSteps(Mockito.any(List.class), Mockito.any(String.class), Mockito.anyInt(), Mockito.any(org.apache.hadoop.hive.ql.metadata.Table.class))).thenCallRealMethod();
    org.apache.hadoop.hive.ql.metadata.Table meta_table = Mockito.mock(org.apache.hadoop.hive.ql.metadata.Table.class);
    org.apache.hadoop.hive.metastore.api.Table api_table = Mockito.mock(org.apache.hadoop.hive.metastore.api.Table.class);
    Mockito.when(api_table.getDbName()).thenReturn("TestDB");
    Mockito.when(api_table.getTableName()).thenReturn("TestTable");
    Mockito.when(meta_table.getTTable()).thenReturn(api_table);
    List<CopyEntity> copyEntities = new ArrayList<CopyEntity>();
    String fileSet = "testFileSet";
    int initialPriority = 0;
    int priority = helper.addTableDeregisterSteps(copyEntities, fileSet, initialPriority, meta_table);
    Assert.assertTrue(priority == 1);
    Assert.assertTrue(copyEntities.size() == 1);
    Assert.assertTrue(copyEntities.get(0) instanceof PostPublishStep);
    PostPublishStep p = (PostPublishStep) (copyEntities.get(0));
    Assert.assertTrue(p.getStep().toString().contains("Deregister table TestDB.TestTable on Hive metastore /targetURI"));
}
Also used : Table(org.apache.hadoop.hive.ql.metadata.Table) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) ArrayList(java.util.ArrayList) Properties(java.util.Properties) State(org.apache.gobblin.configuration.State) List(java.util.List) HiveRegProps(org.apache.gobblin.hive.HiveRegProps) Test(org.testng.annotations.Test)
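
The test above leans on Mockito partial mocking: every getter the real method touches is stubbed, while addTableDeregisterSteps itself is wired with thenCallRealMethod(). A minimal, self-contained sketch of that pattern (the Helper class below is illustrative, not part of Gobblin):

import org.mockito.Mockito;

public class PartialMockSketch {

    // Stand-in for a collaborator-heavy class such as HiveCopyEntityHelper.
    static class Helper {
        String targetUri() { return "real-uri"; }
        String describe() { return "Deregister on " + targetUri(); }  // calls the stubbed getter
    }

    public static void main(String[] args) {
        Helper helper = Mockito.mock(Helper.class);
        // Stub the getter the real method depends on ...
        Mockito.when(helper.targetUri()).thenReturn("/targetURI");
        // ... but let the method under test execute its real body.
        Mockito.when(helper.describe()).thenCallRealMethod();
        // Prints "Deregister on /targetURI": the real describe() sees the stubbed getter.
        System.out.println(helper.describe());
    }
}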

Example 7 with CopyEntity

Use of org.apache.gobblin.data.management.copy.CopyEntity in project incubator-gobblin by apache.

From class DeletingCopyDataPublisherTest, method testDeleteOnSource.

@Test
public void testDeleteOnSource() throws Exception {
    State state = getTestState("testDeleteOnSource");
    Path testMethodTempPath = new Path(testClassTempPath, "testDeleteOnSource");
    DeletingCopyDataPublisher copyDataPublisher = closer.register(new DeletingCopyDataPublisher(state));
    File outputDir = new File(testMethodTempPath.toString(), "task-output/jobid/1f042f494d1fe2198e0e71a17faa233f33b5099b");
    outputDir.mkdirs();
    outputDir.deleteOnExit();
    WorkUnitState wus = new WorkUnitState();
    CopyableDataset copyableDataset = new TestCopyableDataset(new Path("origin"));
    CopyableDatasetMetadata metadata = new CopyableDatasetMetadata(copyableDataset);
    CopyEntity cf = CopyableFileUtils.createTestCopyableFile(new Path(testMethodTempPath, "test.txt").toString());
    CopySource.serializeCopyableDataset(wus, metadata);
    CopySource.serializeCopyEntity(wus, cf);
    Assert.assertTrue(fs.exists(new Path(testMethodTempPath, "test.txt")));
    wus.setWorkingState(WorkingState.SUCCESSFUL);
    copyDataPublisher.publishData(ImmutableList.of(wus));
    Assert.assertFalse(fs.exists(new Path(testMethodTempPath, "test.txt")));
}
Also used : Path(org.apache.hadoop.fs.Path) TestCopyableDataset(org.apache.gobblin.data.management.copy.TestCopyableDataset) CopyableDataset(org.apache.gobblin.data.management.copy.CopyableDataset) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkingState(org.apache.gobblin.configuration.WorkUnitState.WorkingState) CopyableDatasetMetadata(org.apache.gobblin.data.management.copy.CopyableDatasetMetadata) File(java.io.File) Test(org.testng.annotations.Test)
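
The publisher finds the file to delete by reading back the same state the test wrote. A short sketch of that round-trip, using only the CopySource serialize/deserialize pairs that appear in these examples (the temporary path is illustrative):

@Test
public void sketchSerializeDeserializeRoundTrip() throws Exception {
    WorkUnitState wus = new WorkUnitState();
    CopyableDatasetMetadata metadata = new CopyableDatasetMetadata(new TestCopyableDataset(new Path("origin")));
    CopyEntity cf = CopyableFileUtils.createTestCopyableFile("/tmp/test.txt");
    // Write both objects into the work unit state ...
    CopySource.serializeCopyableDataset(wus, metadata);
    CopySource.serializeCopyEntity(wus, cf);
    // ... and read them back, as CopyDataPublisher does in Example 10.
    CopyableDatasetMetadata sameMetadata = CopySource.deserializeCopyableDataset(wus);
    CopyEntity sameEntity = CopySource.deserializeCopyEntity(wus);
    Assert.assertNotNull(sameMetadata);
    Assert.assertNotNull(sameEntity);
}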

Example 8 with CopyEntity

Use of org.apache.gobblin.data.management.copy.CopyEntity in project incubator-gobblin by apache.

From class ConfigBasedDatasetTest, method testGetCopyableFiles.

@Test
public void testGetCopyableFiles() throws Exception {
    String sourceDir = getClass().getClassLoader().getResource("configBasedDatasetTest/src").getFile();
    String destinationDir = getClass().getClassLoader().getResource("configBasedDatasetTest/dest").getFile();
    long sourceWatermark = 100L;
    Collection<? extends CopyEntity> copyableFiles = testGetCopyableFilesHelper(sourceDir, destinationDir, sourceWatermark, false);
    Assert.assertEquals(copyableFiles.size(), 8);
    copyableFiles = testGetCopyableFilesHelper(sourceDir, destinationDir, sourceWatermark, true);
    Assert.assertEquals(copyableFiles.size(), 6);
    Set<Path> paths = Sets.newHashSet(new Path("dir1/file2"), new Path("dir1/file1"), new Path("dir2/file1"), new Path("dir2/file3"));
    for (CopyEntity copyEntity : copyableFiles) {
        if (copyEntity instanceof CopyableFile) {
            CopyableFile file = (CopyableFile) copyEntity;
            Path originRelativePath = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(file.getOrigin().getPath()), PathUtils.getPathWithoutSchemeAndAuthority(new Path(sourceDir)));
            Path targetRelativePath = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(file.getDestination()), PathUtils.getPathWithoutSchemeAndAuthority(new Path(destinationDir)));
            Assert.assertTrue(paths.contains(originRelativePath));
            Assert.assertTrue(paths.contains(targetRelativePath));
            Assert.assertEquals(originRelativePath, targetRelativePath);
        } else if (copyEntity instanceof PrePublishStep) {
            PrePublishStep pre = (PrePublishStep) copyEntity;
            Assert.assertTrue(pre.getStep() instanceof DeleteFileCommitStep);
            // need to delete this file
            Assert.assertTrue(pre.explain().indexOf("configBasedDatasetTest/dest/dir1/file1") > 0);
        } else if (copyEntity instanceof PostPublishStep) {
            PostPublishStep post = (PostPublishStep) copyEntity;
            Assert.assertTrue(post.getStep() instanceof WatermarkMetadataGenerationCommitStep);
            Assert.assertTrue(post.explain().indexOf("dest/_metadata") > 0 && post.explain().indexOf("" + sourceWatermark) > 0);
        } else {
            throw new Exception("Wrong type");
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) PrePublishStep(org.apache.gobblin.data.management.copy.entities.PrePublishStep) Test(org.testng.annotations.Test)
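
The instanceof dispatch in the loop above is the usual way to separate actual file copies from the pre/post publish commit steps that a dataset emits. A hedged sketch of the same idea factored into a small helper (the method is illustrative, not Gobblin API):

// Illustrative helper: counts each kind of CopyEntity produced by getCopyableFiles.
static String summarize(Collection<? extends CopyEntity> entities) {
    int files = 0, preSteps = 0, postSteps = 0;
    for (CopyEntity entity : entities) {
        if (entity instanceof CopyableFile) {
            files++;            // data files to copy
        } else if (entity instanceof PrePublishStep) {
            preSteps++;         // e.g. DeleteFileCommitStep run before publish
        } else if (entity instanceof PostPublishStep) {
            postSteps++;        // e.g. watermark metadata generation run after publish
        }
    }
    return String.format("files=%d, preSteps=%d, postSteps=%d", files, preSteps, postSteps);
}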

Example 9 with CopyEntity

Use of org.apache.gobblin.data.management.copy.CopyEntity in project incubator-gobblin by apache.

From class ConfigBasedDataset, method getCopyableFiles.

@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration copyConfiguration) throws IOException {
    List<CopyEntity> copyableFiles = Lists.newArrayList();
    EndPoint copyFromRaw = copyRoute.getCopyFrom();
    EndPoint copyToRaw = copyRoute.getCopyTo();
    if (!(copyFromRaw instanceof HadoopFsEndPoint && copyToRaw instanceof HadoopFsEndPoint)) {
        log.warn("Currently only handle the Hadoop Fs EndPoint replication");
        return copyableFiles;
    }
    // For {@link HadoopFsEndPoint}s, set pathfilter and applyFilterToDirectories
    HadoopFsEndPoint copyFrom = (HadoopFsEndPoint) copyFromRaw;
    HadoopFsEndPoint copyTo = (HadoopFsEndPoint) copyToRaw;
    copyFrom.setPathFilter(pathFilter);
    copyFrom.setApplyFilterToDirectories(applyFilterToDirectories);
    copyTo.setPathFilter(pathFilter);
    copyTo.setApplyFilterToDirectories(applyFilterToDirectories);
    if (this.watermarkEnabled) {
        if ((!copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent()) || (copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent() && copyFromRaw.getWatermark().get().compareTo(copyToRaw.getWatermark().get()) <= 0)) {
            log.info("No need to copy as destination watermark >= source watermark with source watermark {}, for dataset with metadata {}", copyFromRaw.getWatermark().isPresent() ? copyFromRaw.getWatermark().get().toJson() : "N/A", this.rc.getMetaData());
            return copyableFiles;
        }
    }
    Configuration conf = HadoopUtils.newConfiguration();
    FileSystem copyFromFs = FileSystem.get(copyFrom.getFsURI(), conf);
    FileSystem copyToFs = FileSystem.get(copyTo.getFsURI(), conf);
    Collection<FileStatus> allFilesInSource = copyFrom.getFiles();
    Collection<FileStatus> allFilesInTarget = copyTo.getFiles();
    Set<FileStatus> copyFromFileStatuses = Sets.newHashSet(allFilesInSource);
    Map<Path, FileStatus> copyToFileMap = Maps.newHashMap();
    for (FileStatus f : allFilesInTarget) {
        copyToFileMap.put(PathUtils.getPathWithoutSchemeAndAuthority(f.getPath()), f);
    }
    Collection<Path> deletedPaths = Lists.newArrayList();
    boolean watermarkMetadataCopied = false;
    boolean deleteTargetIfNotExistOnSource = rc.isDeleteTargetIfNotExistOnSource();
    for (FileStatus originFileStatus : copyFromFileStatuses) {
        Path relative = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(originFileStatus.getPath()), PathUtils.getPathWithoutSchemeAndAuthority(copyFrom.getDatasetPath()));
        // construct the new path in the target file system
        Path newPath = new Path(copyTo.getDatasetPath(), relative);
        if (relative.toString().equals(ReplicaHadoopFsEndPoint.WATERMARK_FILE)) {
            watermarkMetadataCopied = true;
        }
    // skip files that already exist on the target with the same length and a newer modification time
        if (copyToFileMap.containsKey(newPath) && copyToFileMap.get(newPath).getLen() == originFileStatus.getLen() && copyToFileMap.get(newPath).getModificationTime() > originFileStatus.getModificationTime()) {
            log.debug("Copy from timestamp older than copy to timestamp, skipped copy {} for dataset with metadata {}", originFileStatus.getPath(), this.rc.getMetaData());
        } else {
            // need to remove those files in the target File System
            if (copyToFileMap.containsKey(newPath)) {
                deletedPaths.add(newPath);
            }
            CopyableFile copyableFile = CopyableFile.fromOriginAndDestination(copyFromFs, originFileStatus, copyToFs.makeQualified(newPath), copyConfiguration).fileSet(PathUtils.getPathWithoutSchemeAndAuthority(copyTo.getDatasetPath()).toString()).build();
            copyableFile.setFsDatasets(copyFromFs, copyToFs);
            copyableFiles.add(copyableFile);
        }
        // clean up already checked paths
        copyToFileMap.remove(newPath);
    }
    // delete paths in the target directory that do NOT exist on the source
    if (deleteTargetIfNotExistOnSource) {
        deletedPaths.addAll(copyToFileMap.keySet());
    }
    // delete old files first
    if (!deletedPaths.isEmpty()) {
        DeleteFileCommitStep deleteCommitStep = DeleteFileCommitStep.fromPaths(copyToFs, deletedPaths, this.props);
        copyableFiles.add(new PrePublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(), deleteCommitStep, 0));
    }
    // Generate the watermark file even if watermark checking is disabled, so that it becomes functional once desired.
    if ((!watermarkMetadataCopied) && copyFrom.getWatermark().isPresent()) {
        copyableFiles.add(new PostPublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(), new WatermarkMetadataGenerationCommitStep(copyTo.getFsURI().toString(), copyTo.getDatasetPath(), copyFrom.getWatermark().get()), 1));
    }
    return copyableFiles;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) FileSystem(org.apache.hadoop.fs.FileSystem) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) PrePublishStep(org.apache.gobblin.data.management.copy.entities.PrePublishStep)
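
The heart of getCopyableFiles is the relativize-and-remap step: strip scheme and authority, relativize against the source dataset root, then resolve against the target root. A minimal sketch of just that computation, using the same PathUtils calls as the method above (the cluster URIs are made up for illustration):

Path sourceRoot = new Path("hdfs://src-cluster/data/dataset");
Path targetRoot = new Path("hdfs://dst-cluster/data/dataset");
Path sourceFile = new Path("hdfs://src-cluster/data/dataset/dir1/file1");

// dir1/file1: the file's position relative to the source dataset root
Path relative = PathUtils.relativizePath(
    PathUtils.getPathWithoutSchemeAndAuthority(sourceFile),
    PathUtils.getPathWithoutSchemeAndAuthority(sourceRoot));

// hdfs://dst-cluster/data/dataset/dir1/file1: the same file under the target root
Path newPath = new Path(targetRoot, relative);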

Example 10 with CopyEntity

Use of org.apache.gobblin.data.management.copy.CopyEntity in project incubator-gobblin by apache.

From class CopyDataPublisher, method persistFailedFileSet.

private int persistFailedFileSet(Collection<? extends WorkUnitState> workUnitStates) throws IOException {
    int filesPersisted = 0;
    for (WorkUnitState wu : workUnitStates) {
        if (wu.getWorkingState() == WorkingState.SUCCESSFUL) {
            CopyEntity entity = CopySource.deserializeCopyEntity(wu);
            if (entity instanceof CopyableFile) {
                CopyableFile file = (CopyableFile) entity;
                Path outputDir = FileAwareInputStreamDataWriter.getOutputDir(wu);
                CopyableDatasetMetadata metadata = CopySource.deserializeCopyableDataset(wu);
                Path outputPath = FileAwareInputStreamDataWriter.getOutputFilePath(file, outputDir, file.getDatasetAndPartition(metadata));
                if (this.recoveryHelper.persistFile(wu, file, outputPath)) {
                    filesPersisted++;
                }
            }
        }
    }
    return filesPersisted;
}
Also used : Path(org.apache.hadoop.fs.Path) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) CommitStepCopyEntity(org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) CopyableDatasetMetadata(org.apache.gobblin.data.management.copy.CopyableDatasetMetadata) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile)
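
Only work units that finished successfully are considered for persistence; the rest are left alone. A compact sketch of that filter on its own, using the same WorkUnitState API (the surrounding lists are illustrative):

List<WorkUnitState> successful = Lists.newArrayList();
List<WorkUnitState> others = Lists.newArrayList();
for (WorkUnitState wu : workUnitStates) {
    if (wu.getWorkingState() == WorkUnitState.WorkingState.SUCCESSFUL) {
        successful.add(wu);   // candidates for recoveryHelper.persistFile(...)
    } else {
        others.add(wu);       // not persisted here
    }
}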

Aggregations

CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity): 12 usages
CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile): 8 usages
Path (org.apache.hadoop.fs.Path): 6 usages
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 4 usages
PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep): 4 usages
PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep): 4 usages
DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep): 4 usages
CopyableDatasetMetadata (org.apache.gobblin.data.management.copy.CopyableDatasetMetadata): 3 usages
CommitStepCopyEntity (org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity): 3 usages
Test (org.testng.annotations.Test): 3 usages
IOException (java.io.IOException): 2 usages
State (org.apache.gobblin.configuration.State): 2 usages
MultiTimingEvent (org.apache.gobblin.metrics.event.MultiTimingEvent): 2 usages
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 2 usages
Table (org.apache.hadoop.hive.ql.metadata.Table): 2 usages
Closer (com.google.common.io.Closer): 1 usage
SuppressFBWarnings (edu.umd.cs.findbugs.annotations.SuppressFBWarnings): 1 usage
File (java.io.File): 1 usage
ArrayList (java.util.ArrayList): 1 usage
List (java.util.List): 1 usage