Search in sources :

Example 1 with DatePartitionedAvroFileSource

use of org.apache.gobblin.source.DatePartitionedAvroFileSource in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method testJobStateNotCopiedToWorkUnit.

/**
 * Checks that an arbitrary job-level property placed on the {@link SourceState} is not present on any
 * work unit returned by {@link DatePartitionedAvroFileSource#getWorkunits}, including work units nested
 * inside a {@link MultiWorkUnit}.
 */
@Test
public void testJobStateNotCopiedToWorkUnit() {
    String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY;
    // Minimum watermark is placed one minute before the test's start time.
    String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(startDateTime.minusMinutes(1));
    String dummyKey = "dummy.job.config";

    SourceState jobState = new SourceState();
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
    jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);
    jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    jobState.setProp("date.partitioned.source.partition.prefix", PREFIX);
    jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);
    // Marker property: it lives only on the job state and must not show up on work units.
    jobState.setProp(dummyKey, "dummy");

    DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
    List<WorkUnit> generated = avroSource.getWorkunits(jobState);
    Assert.assertEquals(generated.size(), 4);

    for (WorkUnit candidate : generated) {
        if (!(candidate instanceof MultiWorkUnit)) {
            Assert.assertFalse(candidate.contains(dummyKey));
            continue;
        }
        // A MultiWorkUnit is checked through each of the work units it wraps.
        for (WorkUnit nested : ((MultiWorkUnit) candidate).getWorkUnits()) {
            Assert.assertFalse(nested.contains(dummyKey));
        }
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) DatePartitionedAvroFileSource(org.apache.gobblin.source.DatePartitionedAvroFileSource) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 2 with DatePartitionedAvroFileSource

use of org.apache.gobblin.source.DatePartitionedAvroFileSource in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method testReadPartitionsByMinute.

/**
 * Builds a {@link SourceState} with a partition pattern, prefix and suffix, asks the source for its
 * work units, expects exactly four of them, and hands them to {@code verifyWorkUnits} for further checks.
 */
@Test
public void testReadPartitionsByMinute() throws IOException, DataRecordException {
    String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY;
    // Minimum watermark is placed one minute before the test's start time.
    String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(startDateTime.minusMinutes(1));

    SourceState jobState = new SourceState();
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
    jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);
    jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    jobState.setProp("date.partitioned.source.partition.prefix", PREFIX);
    jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    // Data is partitioned by minute, i.e. each work unit is assigned the records found under a single
    // YYYY/MM/dd/HH_mm directory.
    DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
    List<WorkUnit> generated = avroSource.getWorkunits(jobState);
    Assert.assertEquals(generated.size(), 4);
    verifyWorkUnits(generated);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) DatePartitionedAvroFileSource(org.apache.gobblin.source.DatePartitionedAvroFileSource) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 3 with DatePartitionedAvroFileSource

use of org.apache.gobblin.source.DatePartitionedAvroFileSource in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method testReadPartitionsByMinuteWithLeadtime.

/**
 * Same setup as the by-minute read, plus a partition lead time of 3 with granularity {@code HOUR};
 * expects the source to return a single work unit and passes it, with the count, to
 * {@code verifyWorkUnits}.
 */
@Test
public void testReadPartitionsByMinuteWithLeadtime() throws IOException, DataRecordException {
    String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY;
    // Minimum watermark is placed one minute before the test's start time.
    String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(startDateTime.minusMinutes(1));

    SourceState jobState = new SourceState();
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
    jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);
    jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    jobState.setProp("date.partitioned.source.partition.prefix", PREFIX);
    jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);
    jobState.setProp("date.partitioned.source.partition.lead_time.size", "3");
    jobState.setProp("date.partitioned.source.partition.lead_time.granularity", "HOUR");

    // With a 3-hour lead time, only the first work unit (6 hours old; the rest are 2 hours old)
    // should be picked up.
    DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
    List<WorkUnit> generated = avroSource.getWorkunits(jobState);
    int producedCount = generated.size();
    Assert.assertEquals(producedCount, 1);
    verifyWorkUnits(generated, producedCount);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) DatePartitionedAvroFileSource(org.apache.gobblin.source.DatePartitionedAvroFileSource) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 4 with DatePartitionedAvroFileSource

use of org.apache.gobblin.source.DatePartitionedAvroFileSource in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method testWorksNoPrefix.

/**
 * Configures the source without a partition-prefix property, pointing the data directory at the
 * {@code PREFIX} sub-directory instead, and expects four work units that pass {@code verifyWorkUnits}.
 */
@Test
public void testWorksNoPrefix() throws IOException, DataRecordException {
    // The prefix is folded into the data directory itself; no prefix property is set below.
    String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY + Path.SEPARATOR + PREFIX;
    // Minimum watermark is placed one minute before the test's start time.
    String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(startDateTime.minusMinutes(1));

    SourceState jobState = new SourceState();
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
    jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);
    jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    // Data is partitioned by minute, i.e. each work unit is assigned the records found under a single
    // YYYY/MM/dd/HH_mm directory.
    DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
    List<WorkUnit> generated = avroSource.getWorkunits(jobState);
    Assert.assertEquals(generated.size(), 4);
    verifyWorkUnits(generated);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) DatePartitionedAvroFileSource(org.apache.gobblin.source.DatePartitionedAvroFileSource) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Aggregations

SourceState (org.apache.gobblin.configuration.SourceState)4 DatePartitionedAvroFileSource (org.apache.gobblin.source.DatePartitionedAvroFileSource)4 MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)4 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)4 Test (org.testng.annotations.Test)4