Search in sources :

Example 1 with ComparableWatermark

use of org.apache.gobblin.source.extractor.ComparableWatermark in project incubator-gobblin by apache.

the class ReplicaHadoopFsEndPoint method getWatermark.

/**
 * Returns the watermark for this replica, attempting to load it at most once.
 *
 * <p>On the first call the {@code WATERMARK_FILE} under the replica path is read (if it
 * exists) and deserialized; the result is cached only when it is a
 * {@link ComparableWatermark}. Every later call returns the cached value without touching
 * the file system, because {@code watermarkInitialized} is set before the read is attempted.
 * Failures are logged (with their cause) and never propagated.
 *
 * @return the cached watermark; whatever {@code cachedWatermark} held before the call when
 *         the file is missing, unreadable, or does not deserialize to a comparable watermark
 */
@Override
public synchronized Optional<ComparableWatermark> getWatermark() {
    if (this.watermarkInitialized) {
        return this.cachedWatermark;
    }
    // Flag is raised before the read on purpose: a failed attempt is not retried.
    this.watermarkInitialized = true;
    try {
        Path metaData = new Path(rc.getPath(), WATERMARK_FILE);
        FileSystem fs = FileSystem.get(rc.getFsURI(), new Configuration());
        // For a replica the file time stamp cannot stand in for the watermark, since it
        // differs from the original source time stamp; only the metadata file is trusted.
        if (fs.exists(metaData)) {
            try (FSDataInputStream fin = fs.open(metaData);
                InputStreamReader reader = new InputStreamReader(fin, Charsets.UTF_8)) {
                String content = CharStreams.toString(reader);
                Watermark w = WatermarkMetadataUtil.deserialize(content);
                if (w instanceof ComparableWatermark) {
                    this.cachedWatermark = Optional.of((ComparableWatermark) w);
                }
            }
        }
    } catch (IOException e) {
        // Keep the cause: this branch also covers I/O failures other than a missing file.
        log.warn("Can not find " + WATERMARK_FILE + " for replica " + this, e);
    } catch (WatermarkMetadataUtil.WatermarkMetadataMulFormatException e) {
        log.warn("Can not create watermark from " + WATERMARK_FILE + " for replica " + this, e);
    }
    return this.cachedWatermark;
}
Also used : Path(org.apache.hadoop.fs.Path) ComparableWatermark(org.apache.gobblin.source.extractor.ComparableWatermark) Configuration(org.apache.hadoop.conf.Configuration) InputStreamReader(java.io.InputStreamReader) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) IOException(java.io.IOException) Watermark(org.apache.gobblin.source.extractor.Watermark) ComparableWatermark(org.apache.gobblin.source.extractor.ComparableWatermark)

Example 2 with ComparableWatermark

use of org.apache.gobblin.source.extractor.ComparableWatermark in project incubator-gobblin by apache.

the class CopyRouteGeneratorTest method testCopyRouteGenerator.

/**
 * Builds a topology with three candidate origins for one destination (an unavailable
 * replica, an available replica and the source) and checks which origin each pull-route
 * generator selects, together with the watermark of the selected origin.
 */
@Test
public void testCopyRouteGenerator() throws Exception {
    // Oct 1, 2016
    long replica1Watermark = 1475304606000L;
    // Oct 4, 2016
    long sourceWatermark = 1475604606000L;

    // Replica whose file system is down and which carries no watermark.
    ReplicaHadoopFsEndPoint notAvailableReplica = Mockito.mock(ReplicaHadoopFsEndPoint.class);
    Mockito.when(notAvailableReplica.isFileSystemAvailable()).thenReturn(false);
    Optional<ComparableWatermark> absentWatermark = Optional.absent();
    Mockito.when(notAvailableReplica.getWatermark()).thenReturn(absentWatermark);

    // Healthy replica carrying the older watermark.
    ReplicaHadoopFsEndPoint replica1 = Mockito.mock(ReplicaHadoopFsEndPoint.class);
    Mockito.when(replica1.isFileSystemAvailable()).thenReturn(true);
    ComparableWatermark replicaMark = new LongWatermark(replica1Watermark);
    Optional<ComparableWatermark> replicaMarkHolder = Optional.of(replicaMark);
    Mockito.when(replica1.getWatermark()).thenReturn(replicaMarkHolder);

    // Source carrying the newer watermark.
    SourceHadoopFsEndPoint source = Mockito.mock(SourceHadoopFsEndPoint.class);
    Mockito.when(source.isFileSystemAvailable()).thenReturn(true);
    ComparableWatermark sourceMark = new LongWatermark(sourceWatermark);
    Optional<ComparableWatermark> sourceMarkHolder = Optional.of(sourceMark);
    Mockito.when(source.getWatermark()).thenReturn(sourceMarkHolder);

    // Destination of every route below.
    ReplicaHadoopFsEndPoint copyToEndPoint = Mockito.mock(ReplicaHadoopFsEndPoint.class);
    Mockito.when(copyToEndPoint.isFileSystemAvailable()).thenReturn(true);

    CopyRoute fromUnavailable = new CopyRoute(notAvailableReplica, copyToEndPoint);
    CopyRoute fromReplica = new CopyRoute(replica1, copyToEndPoint);
    CopyRoute fromSource = new CopyRoute(source, copyToEndPoint);
    DataFlowTopology.DataFlowPath routes =
        new DataFlowTopology.DataFlowPath(ImmutableList.<CopyRoute>of(fromUnavailable, fromReplica, fromSource));
    DataFlowTopology topology = new DataFlowTopology();
    topology.addDataFlowPath(routes);

    ReplicationConfiguration rc = Mockito.mock(ReplicationConfiguration.class);
    Mockito.when(rc.getCopyMode()).thenReturn(ReplicationCopyMode.PULL);
    Mockito.when(rc.getSource()).thenReturn(source);
    Mockito.when(rc.getReplicas()).thenReturn(ImmutableList.<EndPoint>of(notAvailableReplica, replica1, copyToEndPoint));
    Mockito.when(rc.getDataFlowToplogy()).thenReturn(topology);

    // The bandwidth-optimized generator is expected to pull from replica1.
    CopyRouteGeneratorOptimizedNetworkBandwidthForTest bandwidthOptimized = new CopyRouteGeneratorOptimizedNetworkBandwidthForTest();
    Assert.assertTrue(bandwidthOptimized.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().equals(replica1));
    Assert.assertTrue(bandwidthOptimized.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().getWatermark().get()
        .compareTo(new LongWatermark(replica1Watermark)) == 0);

    // The latency-optimized generator is expected to pull from the source.
    CopyRouteGeneratorOptimizedLatency latencyOptimized = new CopyRouteGeneratorOptimizedLatency();
    Assert.assertTrue(latencyOptimized.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().equals(source));
    Assert.assertTrue(latencyOptimized.getPullRoute(rc, copyToEndPoint).get().getCopyFrom().getWatermark().get()
        .compareTo(new LongWatermark(sourceWatermark)) == 0);
}
Also used : ComparableWatermark(org.apache.gobblin.source.extractor.ComparableWatermark) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 3 with ComparableWatermark

use of org.apache.gobblin.source.extractor.ComparableWatermark in project incubator-gobblin by apache.

the class ConfigBasedDatasetTest method testGetCopyableFilesHelper.

/**
 * Test helper: wires a {@link ConfigBasedDataset} over mocked end points that both live on
 * the local file system and returns the copy entities it produces.
 *
 * @param sourceDir directory reported as the dataset path of the copy-from end point
 * @param destinationDir directory reported as the dataset path of the copy-to end point; also
 *        used as the publish directory of the copy configuration
 * @param sourceWatermark value wrapped in a {@link LongWatermark} for the copy-from end point
 * @param isFilterEnabled when true, configures the hidden-file path filter and applies it to
 *        directories as well
 * @return the result of {@code dataset.getCopyableFiles(...)} for the local file system
 * @throws Exception if any of the file system or dataset calls fails
 */
public Collection<? extends CopyEntity> testGetCopyableFilesHelper(String sourceDir, String destinationDir, long sourceWatermark, boolean isFilterEnabled) throws Exception {
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    URI localUri = localFs.getUri();

    Properties props = new Properties();
    props.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher");

    // Default filter first; replaced below when filtering is switched on.
    PathFilter filter = DatasetUtils.instantiatePathFilter(props);
    boolean filterDirectories = false;
    if (isFilterEnabled) {
        props.setProperty(DatasetUtils.CONFIGURATION_KEY_PREFIX + "path.filter.class", "org.apache.gobblin.util.filters.HiddenFilter");
        props.setProperty(CopyConfiguration.APPLY_FILTER_TO_DIRECTORIES, "true");
        filter = DatasetUtils.instantiatePathFilter(props);
        filterDirectories = Boolean.parseBoolean(props.getProperty(CopyConfiguration.APPLY_FILTER_TO_DIRECTORIES, "false"));
    }

    CopyConfiguration copyConfig = CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), props)
        .publishDir(new Path(destinationDir))
        .preserve(PreserveAttributes.fromMnemonicString("ugp"))
        .build();

    ReplicationMetaData metaData = Mockito.mock(ReplicationMetaData.class);
    Mockito.when(metaData.toString()).thenReturn("Mock Meta Data");

    ReplicationConfiguration replicationConfig = Mockito.mock(ReplicationConfiguration.class);
    Mockito.when(replicationConfig.getCopyMode()).thenReturn(ReplicationCopyMode.PULL);
    Mockito.when(replicationConfig.getMetaData()).thenReturn(metaData);

    // Origin end point: has a watermark and lists the files under sourceDir.
    HadoopFsEndPoint origin = Mockito.mock(HadoopFsEndPoint.class);
    Mockito.when(origin.getDatasetPath()).thenReturn(new Path(sourceDir));
    Mockito.when(origin.getFsURI()).thenReturn(localUri);
    ComparableWatermark originMark = new LongWatermark(sourceWatermark);
    Mockito.when(origin.getWatermark()).thenReturn(Optional.of(originMark));
    Mockito.when(origin.getFiles()).thenReturn(FileListUtils.listFilesRecursively(localFs, new Path(sourceDir), filter, filterDirectories));

    // Target end point: no watermark, lists the files under destinationDir.
    HadoopFsEndPoint target = Mockito.mock(HadoopFsEndPoint.class);
    Mockito.when(target.getDatasetPath()).thenReturn(new Path(destinationDir));
    Mockito.when(target.getFsURI()).thenReturn(localUri);
    Optional<ComparableWatermark> noWatermark = Optional.absent();
    Mockito.when(target.getWatermark()).thenReturn(noWatermark);
    Mockito.when(target.getFiles()).thenReturn(FileListUtils.listFilesRecursively(localFs, new Path(destinationDir), filter, filterDirectories));

    CopyRoute copyRoute = Mockito.mock(CopyRoute.class);
    Mockito.when(copyRoute.getCopyFrom()).thenReturn(origin);
    Mockito.when(copyRoute.getCopyTo()).thenReturn(target);

    ConfigBasedDataset dataset = new ConfigBasedDataset(replicationConfig, props, copyRoute);
    return dataset.getCopyableFiles(localFs, copyConfig);
}
Also used : Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) Configuration(org.apache.hadoop.conf.Configuration) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) Properties(java.util.Properties) URI(java.net.URI) ComparableWatermark(org.apache.gobblin.source.extractor.ComparableWatermark) FileSystem(org.apache.hadoop.fs.FileSystem) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)

Example 4 with ComparableWatermark

use of org.apache.gobblin.source.extractor.ComparableWatermark in project incubator-gobblin by apache.

the class SourceHadoopFsEndPoint method getWatermark.

/**
 * Returns the watermark of this source end point, computing it on first successful use.
 *
 * <p>The watermark is the largest modification time among the files listed recursively
 * under the valid paths of this end point, wrapped in a {@link LongWatermark}; it is
 * {@code -1} when no file is listed. As a side effect the listed files are stored in
 * {@code allFileStatus}. Once a computation succeeds the result is cached and later calls
 * return it directly. If listing fails, the error is logged with its cause, the previously
 * cached value is returned, and the next call tries again.
 *
 * @return the cached watermark, or the prior value of {@code cachedWatermark} when the
 *         listing fails
 */
@Override
public synchronized Optional<ComparableWatermark> getWatermark() {
    if (this.initialized) {
        return this.cachedWatermark;
    }
    try {
        long curTs = -1;
        FileSystem fs = FileSystem.get(rc.getFsURI(), new Configuration());
        Collection<Path> validPaths = ReplicationDataValidPathPicker.getValidPaths(this);
        // A previous attempt may have failed part-way through the listing; start from an
        // empty collection so a retry does not record the same files twice.
        this.allFileStatus.clear();
        for (Path p : validPaths) {
            this.allFileStatus.addAll(FileListUtils.listFilesRecursively(fs, p, super.getPathFilter(), super.isApplyFilterToDirectories()));
        }
        for (FileStatus f : this.allFileStatus) {
            curTs = Math.max(curTs, f.getModificationTime());
        }
        ComparableWatermark result = new LongWatermark(curTs);
        this.cachedWatermark = Optional.of(result);
        // Only mark as initialized after a complete, successful listing.
        this.initialized = true;
        return this.cachedWatermark;
    } catch (IOException e) {
        log.error("Error while retrieve the watermark for " + this, e);
        return this.cachedWatermark;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) ComparableWatermark(org.apache.gobblin.source.extractor.ComparableWatermark) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) IOException(java.io.IOException) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)

Aggregations

ComparableWatermark (org.apache.gobblin.source.extractor.ComparableWatermark)4 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)3 Configuration (org.apache.hadoop.conf.Configuration)3 FileSystem (org.apache.hadoop.fs.FileSystem)3 Path (org.apache.hadoop.fs.Path)3 IOException (java.io.IOException)2 InputStreamReader (java.io.InputStreamReader)1 URI (java.net.URI)1 Properties (java.util.Properties)1 CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration)1 Watermark (org.apache.gobblin.source.extractor.Watermark)1 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 PathFilter (org.apache.hadoop.fs.PathFilter)1 Test (org.testng.annotations.Test)1