Search in sources :

Example 21 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class CopySourceTest method testCopySource.

/**
 * Runs {@link CopySource#getWorkunits} against the test dataset finder on the local
 * file system and checks the flattened work units: their count must equal
 * {@code TestCopyableDataset.FILE_COUNT}, every deserialized {@link CopyableFile} must
 * originate under {@code TestCopyableDataset.ORIGIN_PREFIX} with the expected owner and
 * permission, and all work units must share the extract of the first one.
 */
@Test
public void testCopySource() throws Exception {
    final String localFsUri = "file:///";

    SourceState sourceState = new SourceState();
    sourceState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, localFsUri);
    sourceState.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, localFsUri);
    sourceState.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
    sourceState.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, TestCopyableDatasetFinder.class.getName());

    CopySource copySource = new CopySource();
    // Flatten so that the assertions below see individual work units.
    List<WorkUnit> flattened = JobLauncherUtils.flattenWorkUnits(copySource.getWorkunits(sourceState));
    Assert.assertEquals(flattened.size(), TestCopyableDataset.FILE_COUNT);

    // Every work unit is compared against the extract of the first one.
    Extract sharedExtract = flattened.get(0).getExtract();
    for (WorkUnit unit : flattened) {
        CopyableFile copyableFile = (CopyableFile) CopySource.deserializeCopyEntity(unit);
        String originPath = copyableFile.getOrigin().getPath().toString();
        Assert.assertTrue(originPath.startsWith(TestCopyableDataset.ORIGIN_PREFIX));
        Assert.assertEquals(copyableFile.getDestinationOwnerAndPermission(), TestCopyableDataset.OWNER_AND_PERMISSION);
        Assert.assertEquals(unit.getExtract(), sharedExtract);
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 22 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class CopySourceTest method testSubmitUnfulfilledRequestEvents.

/**
 * Builds the request allocator of {@link CopySource} through reflection (the
 * {@code createRequestAllocator} method is looked up as a declared method and made
 * accessible), feeds it the dataset requestors produced by the partitionable test
 * dataset finder, and asserts on the requests the allocator reports as exceeding the
 * available resource pool: exactly two file sets, each for dataset URN {@code "/test"}
 * with 5 entities and 50 bytes in total.
 */
@Test
public void testSubmitUnfulfilledRequestEvents() throws IOException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {
    final String localFsUri = "file:///";

    SourceState sourceState = new SourceState();
    sourceState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, localFsUri);
    sourceState.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, localFsUri);
    sourceState.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
    sourceState.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, TestCopyablePartitionableDatasedFinder.class.getCanonicalName());
    sourceState.setProp(CopySource.MAX_CONCURRENT_LISTING_SERVICES, 2);
    // Copy limits handed to the allocator through the copy configuration.
    sourceState.setProp(CopyConfiguration.MAX_COPY_PREFIX + ".size", "50");
    sourceState.setProp(CopyConfiguration.MAX_COPY_PREFIX + ".copyEntities", 2);
    sourceState.setProp(CopyConfiguration.STORE_REJECTED_REQUESTS_KEY, RequestAllocatorConfig.StoreRejectedRequestsConfig.ALL.name().toLowerCase());
    sourceState.setProp(ConfigurationKeys.METRICS_CUSTOM_BUILDERS, "org.apache.gobblin.metrics.ConsoleEventReporterFactory");

    CopySource copySource = new CopySource();
    final FileSystem originFs = HadoopUtils.getSourceFileSystem(sourceState);
    final FileSystem destinationFs = HadoopUtils.getWriterFileSystem(sourceState, 1, 0);
    int listingThreads = sourceState.getPropAsInt(CopySource.MAX_CONCURRENT_LISTING_SERVICES, CopySource.DEFAULT_MAX_CONCURRENT_LISTING_SERVICES);
    final CopyConfiguration copyConfig = CopyConfiguration.builder(destinationFs, sourceState.getProperties()).build();

    MetricContext metrics = Instrumented.getMetricContext(sourceState, CopySource.class);
    EventSubmitter submitter = new EventSubmitter.Builder(metrics, CopyConfiguration.COPY_PREFIX).build();
    DatasetsFinder<CopyableDatasetBase> finder = DatasetUtils.instantiateDatasetFinder(sourceState.getProperties(), originFs, CopySource.DEFAULT_DATASET_PROFILE_CLASS_KEY, submitter, sourceState);

    // Use the finder directly when it is already iterable; otherwise wrap it.
    IterableDatasetFinder<CopyableDatasetBase> iterableFinder;
    if (finder instanceof IterableDatasetFinder) {
        iterableFinder = (IterableDatasetFinder<CopyableDatasetBase>) finder;
    } else {
        iterableFinder = new IterableDatasetFinderImpl<>(finder);
    }

    // Map datasets to requestors, then drop the null entries.
    Iterator<CopyableDatasetRequestor> requestorsIncludingNulls = Iterators.transform(iterableFinder.getDatasetsIterator(), new CopyableDatasetRequestor.Factory(destinationFs, copyConfig, log));
    Iterator<CopyableDatasetRequestor> requestors = Iterators.filter(requestorsIncludingNulls, Predicates.<CopyableDatasetRequestor>notNull());

    // createRequestAllocator is looked up reflectively and made accessible before the call.
    Method allocatorFactory = CopySource.class.getDeclaredMethod("createRequestAllocator", CopyConfiguration.class, int.class);
    allocatorFactory.setAccessible(true);
    PriorityIterableBasedRequestAllocator<FileSet<CopyEntity>> requestAllocator = (PriorityIterableBasedRequestAllocator<FileSet<CopyEntity>>) allocatorFactory.invoke(copySource, copyConfig, listingThreads);

    // The returned iterator is not consumed; only the rejected requests are inspected below.
    Iterator<FileSet<CopyEntity>> allocated = requestAllocator.allocateRequests(requestors, copyConfig.getMaxToCopy());
    List<FileSet<CopyEntity>> rejected = requestAllocator.getRequestsExceedingAvailableResourcePool();
    Assert.assertEquals(rejected.size(), 2);

    // Both rejected file sets are expected to look the same.
    for (int index = 0; index < 2; index++) {
        FileSet<CopyEntity> rejectedSet = rejected.get(index);
        Assert.assertEquals(rejectedSet.getDataset().getUrn(), "/test");
        Assert.assertEquals(rejectedSet.getTotalEntities(), 5);
        Assert.assertEquals(rejectedSet.getTotalSizeInBytes(), 50);
    }
}
Also used : IterableDatasetFinder(org.apache.gobblin.dataset.IterableDatasetFinder) MetricContext(org.apache.gobblin.metrics.MetricContext) FileSystem(org.apache.hadoop.fs.FileSystem) CopyableDatasetRequestor(org.apache.gobblin.data.management.partition.CopyableDatasetRequestor) SourceState(org.apache.gobblin.configuration.SourceState) FileSet(org.apache.gobblin.data.management.partition.FileSet) PriorityIterableBasedRequestAllocator(org.apache.gobblin.util.request_allocation.PriorityIterableBasedRequestAllocator) EventSubmitter(org.apache.gobblin.metrics.event.EventSubmitter) Method(java.lang.reflect.Method) Test(org.testng.annotations.Test)

Example 23 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class BackfillHiveSourceTest method testWhitelist.

/**
 * Initializes a {@link BackfillHiveSource} with a two-entry partition whitelist and
 * checks {@code shouldCreateWorkunit}: it must return {@code true} for mocked partitions
 * whose complete name is one of the whitelisted names and {@code false} for a partition
 * whose complete name is not in the list.
 */
@Test
public void testWhitelist() throws Exception {
    final String aug4 = "service@logEvent@datepartition=2016-08-04-00";
    final String aug5 = "service@logEvent@datepartition=2016-08-05-00";
    final String aug6 = "service@logEvent@datepartition=2016-08-06-00";

    BackfillHiveSource source = new BackfillHiveSource();
    SourceState sourceState = new SourceState();
    // Comma-separated whitelist containing only the first two partition names.
    sourceState.setProp(BackfillHiveSource.BACKFILL_SOURCE_PARTITION_WHITELIST_KEY, aug4 + "," + aug5);
    source.initBackfillHiveSource(sourceState);

    Partition firstListed = Mockito.mock(Partition.class, Mockito.RETURNS_SMART_NULLS);
    Mockito.when(firstListed.getCompleteName()).thenReturn(aug4);

    Partition secondListed = Mockito.mock(Partition.class, Mockito.RETURNS_SMART_NULLS);
    Mockito.when(secondListed.getCompleteName()).thenReturn(aug5);

    Partition unlisted = Mockito.mock(Partition.class, Mockito.RETURNS_SMART_NULLS);
    Mockito.when(unlisted.getCompleteName()).thenReturn(aug6);

    Assert.assertTrue(source.shouldCreateWorkunit(firstListed, new LongWatermark(0)));
    Assert.assertTrue(source.shouldCreateWorkunit(secondListed, new LongWatermark(0)));
    Assert.assertFalse(source.shouldCreateWorkunit(unlisted, new LongWatermark(0)));
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) SourceState(org.apache.gobblin.configuration.SourceState) BackfillHiveSource(org.apache.gobblin.data.management.conversion.hive.source.BackfillHiveSource) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 24 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class HiveSourceTest method getTestState.

/**
 * Creates the {@link SourceState} used by the tests: the given database name under
 * {@code hive.dataset.database}, the table pattern {@code *}, and a fixed job id.
 *
 * @param dbName value stored under the {@code hive.dataset.database} property
 * @return a new state carrying the three properties above
 */
private static SourceState getTestState(String dbName) {
    final String databaseKey = "hive.dataset.database";
    final String tablePatternKey = "hive.dataset.table.pattern";

    SourceState state = new SourceState();
    state.setProp(databaseKey, dbName);
    state.setProp(tablePatternKey, "*");
    state.setProp(ConfigurationKeys.JOB_ID_KEY, "testJobId");
    return state;
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState)

Example 25 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class HiveSourceTest method testGetWorkUnitsForTable.

/**
 * Drops and recreates a test database containing a single Avro table, asks
 * {@code hiveSource} for work units, and checks that exactly one work unit is returned
 * and that, wrapped as a {@link HiveWorkUnit}, it reports the expected database, table
 * and table schema URL.
 */
@Test
public void testGetWorkUnitsForTable() throws Exception {
    final String database = "testdb2";
    final String table = "testtable2";
    final String tableLocation = "/tmp/testtable2";

    // Start from a clean slate: remove any database left over under the same name.
    this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(database, false, true, true);
    SourceState state = getTestState(database);
    this.hiveMetastoreTestUtils.createTestAvroTable(database, table, tableLocation, Optional.<String>absent());

    List<WorkUnit> produced = hiveSource.getWorkunits(state);
    // One workunit for the table, no dummy workunits
    Assert.assertEquals(produced.size(), 1);

    HiveWorkUnit hiveWorkUnit = new HiveWorkUnit(produced.get(0));
    Assert.assertEquals(hiveWorkUnit.getHiveDataset().getDbAndTable().getDb(), database);
    Assert.assertEquals(hiveWorkUnit.getHiveDataset().getDbAndTable().getTable(), table);

    Path expectedSchemaUrl = new Path("/tmp/dummy");
    Assert.assertEquals(hiveWorkUnit.getTableSchemaUrl(), expectedSchemaUrl);
}
Also used : Path(org.apache.hadoop.fs.Path) SourceState(org.apache.gobblin.configuration.SourceState) HiveWorkUnit(org.apache.gobblin.data.management.conversion.hive.source.HiveWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Aggregations

SourceState (org.apache.gobblin.configuration.SourceState)90 Test (org.testng.annotations.Test)76 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)44 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)38 State (org.apache.gobblin.configuration.State)30 WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState)11 Partition (org.apache.hadoop.hive.ql.metadata.Partition)8 Table (org.apache.hadoop.hive.ql.metadata.Table)8 IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder)7 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)7 Extract (org.apache.gobblin.source.workunit.Extract)7 DateTime (org.joda.time.DateTime)7 Dataset (org.apache.gobblin.dataset.Dataset)6 PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset)6 MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)6 WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream)6 IOException (java.io.IOException)5 Path (org.apache.hadoop.fs.Path)5 Gson (com.google.gson.Gson)4 JsonObject (com.google.gson.JsonObject)4