Search in sources :

Example 36 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class DatasetUtilsTest method testGetDatasetSpecificState.

/**
 * Configures dataset-specific properties for two datasets, requests them for a different pair of
 * datasets, and asserts that only the dataset present in both lists gets a state back.
 */
@Test
public void testGetDatasetSpecificState() {
    final String configuredAndRequested = "testDataset1";
    final String configuredOnly = "testDataset2";
    final String requestedOnly = "testDataset3";
    final String propKey = "testKey1";
    final String propValue = "testValue1";

    // The same key/value pair is attached to each configured dataset entry.
    final String keyValueSuffix = "\", \"" + propKey + "\" : \"" + propValue + "\"}";
    final String firstEntry = "{\"dataset\" : \"" + configuredAndRequested + keyValueSuffix;
    final String secondEntry = "{\"dataset\" : \"" + configuredOnly + keyValueSuffix;

    SourceState sourceState = new SourceState();
    sourceState.setProp(DatasetUtils.DATASET_SPECIFIC_PROPS, "[" + firstEntry + ", " + secondEntry + "]");

    State expectedForFirst = new State();
    expectedForFirst.setProp(propKey, propValue);

    Map<String, State> propsByDataset =
        DatasetUtils.getDatasetSpecificProps(Lists.newArrayList(configuredAndRequested, requestedOnly), sourceState);

    Assert.assertEquals(propsByDataset.get(configuredAndRequested), expectedForFirst);
    // Configured but not requested.
    Assert.assertNull(propsByDataset.get(configuredOnly));
    // Requested but not configured.
    Assert.assertNull(propsByDataset.get(requestedOnly));
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) State(org.apache.gobblin.configuration.State) Test(org.testng.annotations.Test)

Example 37 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class DatasetUtilsTest method testGetDatasetSpecificStateWithRegex.

/**
 * Configures a single dataset-specific entry whose dataset name is the pattern
 * {@code testDataset.*} and asserts that the two requested datasets starting with
 * {@code testDataset} receive the configured property while {@code otherTestDataset1} does not.
 */
@Test
public void testGetDatasetSpecificStateWithRegex() {
    final String firstMatch = "testDataset1";
    final String secondMatch = "testDataset2";
    final String nonMatch = "otherTestDataset1";
    final String propKey = "testKey1";
    final String propValue = "testValue1";

    final String specificPropsJson =
        "[{\"dataset\" : \"testDataset.*\", \"" + propKey + "\" : \"" + propValue + "\"}]";

    SourceState sourceState = new SourceState();
    sourceState.setProp(DatasetUtils.DATASET_SPECIFIC_PROPS, specificPropsJson);

    Map<String, State> propsByDataset =
        DatasetUtils.getDatasetSpecificProps(Lists.newArrayList(firstMatch, secondMatch, nonMatch), sourceState);

    // Each matching dataset is compared against its own freshly built expected state.
    for (String matchingDataset : new String[] { firstMatch, secondMatch }) {
        State expected = new State();
        expected.setProp(propKey, propValue);
        Assert.assertEquals(propsByDataset.get(matchingDataset), expected);
    }
    Assert.assertNull(propsByDataset.get(nonMatch));
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) State(org.apache.gobblin.configuration.State) Test(org.testng.annotations.Test)

Example 38 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class RegexBasedPartitionedRetrieverTest method testLeadtime.

/**
 * Initializes a {@code RegexBasedPartitionedRetriever} with a partition lead time of 1 at DAY
 * granularity and asserts that exactly two files are returned, verified against
 * {@code APR_3_2017} and {@code MAY_1_2017} in that order.
 *
 * @throws IOException propagated from the retriever calls
 */
@Test
public void testLeadtime() throws IOException {
    final String partitionPattern = "(\\d+)-PT-\\d+";

    SourceState sourceState = new SourceState();
    sourceState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "file:///");
    sourceState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, tempDir.toString());
    sourceState.setProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN, partitionPattern);
    sourceState.setProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_LEAD_TIME_GRANULARITY, "DAY");
    sourceState.setProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_LEAD_TIME, "1");

    RegexBasedPartitionedRetriever retriever = new RegexBasedPartitionedRetriever("txt");
    retriever.init(sourceState);

    // Query starting just below the APR_3_2017 value, with 9999 as the second argument.
    List<PartitionAwareFileRetriever.FileInfo> retrieved =
        retriever.getFilesToProcess(DateToUse.APR_3_2017.getValue() - 1, 9999);
    Assert.assertEquals(retrieved.size(), 2);

    PartitionAwareFileRetriever.FileInfo firstFile = retrieved.get(0);
    verifyFile(firstFile, DateToUse.APR_3_2017.getValue());
    PartitionAwareFileRetriever.FileInfo secondFile = retrieved.get(1);
    verifyFile(secondFile, DateToUse.MAY_1_2017.getValue());
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Test(org.testng.annotations.Test)

Example 39 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method testJobStateNotCopiedToWorkUnit.

/**
 * Adds an extra key to the job-level state, asks the source for work units, and asserts that
 * none of the returned work units (including those nested in a {@code MultiWorkUnit}) contain
 * that key.
 */
@Test
public void testJobStateNotCopiedToWorkUnit() {
    final String jobOnlyKey = "dummy.job.config";
    final String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY;

    SourceState jobState = new SourceState();
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
    jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);

    // Minimum watermark is one minute before the test's start time, formatted with DATE_PATTERN.
    String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1));
    jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);

    jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    jobState.setProp("date.partitioned.source.partition.prefix", PREFIX);
    jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);
    jobState.setProp(jobOnlyKey, "dummy");

    DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
    List<WorkUnit> produced = avroSource.getWorkunits(jobState);
    Assert.assertEquals(produced.size(), 4);

    for (WorkUnit candidate : produced) {
        if (!(candidate instanceof MultiWorkUnit)) {
            Assert.assertFalse(candidate.contains(jobOnlyKey));
            continue;
        }
        // A MultiWorkUnit is checked through the work units it wraps.
        MultiWorkUnit wrapper = (MultiWorkUnit) candidate;
        for (WorkUnit nested : wrapper.getWorkUnits()) {
            Assert.assertFalse(nested.contains(jobOnlyKey));
        }
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) DatePartitionedAvroFileSource(org.apache.gobblin.source.DatePartitionedAvroFileSource) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 40 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method testReadPartitionsByMinute.

/**
 * Configures a {@code DatePartitionedAvroFileSource} over the test output directory, asserts
 * that it produces four work units, and hands them to {@code verifyWorkUnits}.
 *
 * @throws IOException declared by the test signature
 * @throws DataRecordException declared by the test signature
 */
@Test
public void testReadPartitionsByMinute() throws IOException, DataRecordException {
    final String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY;

    SourceState jobState = new SourceState();
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
    jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);

    // Minimum watermark is one minute before the test's start time, formatted with DATE_PATTERN.
    String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1));
    jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);

    jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    jobState.setProp("date.partitioned.source.partition.prefix", PREFIX);
    jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    // Data is partitioned by minute: each work unit is assigned the records under one
    // YYYY/MM/dd/HH_mm directory.
    DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
    List<WorkUnit> produced = avroSource.getWorkunits(jobState);
    Assert.assertEquals(produced.size(), 4);
    verifyWorkUnits(produced);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) DatePartitionedAvroFileSource(org.apache.gobblin.source.DatePartitionedAvroFileSource) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Aggregations

- SourceState (org.apache.gobblin.configuration.SourceState): 90
- Test (org.testng.annotations.Test): 76
- WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 44
- WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 38
- State (org.apache.gobblin.configuration.State): 30
- WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState): 11
- Partition (org.apache.hadoop.hive.ql.metadata.Partition): 8
- Table (org.apache.hadoop.hive.ql.metadata.Table): 8
- IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 7
- LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 7
- Extract (org.apache.gobblin.source.workunit.Extract): 7
- DateTime (org.joda.time.DateTime): 7
- Dataset (org.apache.gobblin.dataset.Dataset): 6
- PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset): 6
- MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 6
- WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6
- IOException (java.io.IOException): 5
- Path (org.apache.hadoop.fs.Path): 5
- Gson (com.google.gson.Gson): 4
- JsonObject (com.google.gson.JsonObject): 4