Search in sources :

Example 61 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class WorstFitDecreasingBinPackingTest method getWorkUnitWithWeight.

/**
 * Creates a {@link WorkUnit} whose {@code WEIGHT} property holds the given weight.
 *
 * <p>The work unit is backed by an {@code APPEND_ONLY} {@link Extract} constructed with two
 * empty string arguments; the weight is stored in its decimal string form.
 *
 * @param weight the weight to record on the work unit
 * @return the newly created work unit
 */
public WorkUnit getWorkUnitWithWeight(long weight) {
    Extract appendOnlyExtract = new Extract(Extract.TableType.APPEND_ONLY, "", "");
    WorkUnit weighted = new WorkUnit(appendOnlyExtract);
    weighted.setProp(WEIGHT, String.valueOf(weight));
    return weighted;
}
Also used : Extract(org.apache.gobblin.source.workunit.Extract) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 62 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method testJobStateNotCopiedToWorkUnit.

/**
 * Checks that an arbitrary property set on the {@link SourceState} does not show up on any
 * work unit produced by {@link DatePartitionedAvroFileSource#getWorkunits}.
 *
 * <p>Four work units are expected; for a {@link MultiWorkUnit} each contained work unit is
 * inspected, otherwise the work unit itself is inspected.
 */
@Test
public void testJobStateNotCopiedToWorkUnit() {
    String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY;
    String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1));
    String dummyKey = "dummy.job.config";

    SourceState jobState = new SourceState();
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
    jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);
    jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    jobState.setProp("date.partitioned.source.partition.prefix", PREFIX);
    jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);
    // Marker property that must stay on the job state only.
    jobState.setProp(dummyKey, "dummy");

    DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
    List<WorkUnit> produced = avroSource.getWorkunits(jobState);
    Assert.assertEquals(produced.size(), 4);

    for (WorkUnit candidate : produced) {
        if (!(candidate instanceof MultiWorkUnit)) {
            Assert.assertFalse(candidate.contains(dummyKey));
            continue;
        }
        // A MultiWorkUnit is checked through each of the work units it contains.
        for (WorkUnit nested : ((MultiWorkUnit) candidate).getWorkUnits()) {
            Assert.assertFalse(nested.contains(dummyKey));
        }
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) DatePartitionedAvroFileSource(org.apache.gobblin.source.DatePartitionedAvroFileSource) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 63 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method testReadPartitionsByMinute.

/**
 * Reads minute-partitioned data through {@link DatePartitionedAvroFileSource}, configured with
 * both a partition prefix and suffix, and expects four work units which are then passed to
 * {@code verifyWorkUnits}.
 */
@Test
public void testReadPartitionsByMinute() throws IOException, DataRecordException {
    String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY;
    String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1));

    SourceState jobState = new SourceState();
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
    jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);
    jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    jobState.setProp("date.partitioned.source.partition.prefix", PREFIX);
    jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    // Data is partitioned by minute, i.e. each work unit is assigned the records under a single
    // YYYY/MM/dd/HH_mm directory.
    DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
    List<WorkUnit> produced = avroSource.getWorkunits(jobState);
    Assert.assertEquals(produced.size(), 4);
    verifyWorkUnits(produced);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) DatePartitionedAvroFileSource(org.apache.gobblin.source.DatePartitionedAvroFileSource) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 64 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method testReadPartitionsByMinuteWithLeadtime.

/**
 * Reads minute-partitioned data with a lead time of 3 HOUR configured on the source and expects
 * exactly one work unit, which is then passed to {@code verifyWorkUnits} together with the
 * work unit count.
 */
@Test
public void testReadPartitionsByMinuteWithLeadtime() throws IOException, DataRecordException {
    String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY;
    String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1));

    SourceState jobState = new SourceState();
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
    jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);
    jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    jobState.setProp("date.partitioned.source.partition.prefix", PREFIX);
    jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);
    jobState.setProp("date.partitioned.source.partition.lead_time.size", "3");
    jobState.setProp("date.partitioned.source.partition.lead_time.granularity", "HOUR");

    // With a lead time of 3 hours, only the first WorkUnit (6 hours old; the rest are 2 hours old)
    // should get picked up.
    DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
    List<WorkUnit> produced = avroSource.getWorkunits(jobState);
    int producedCount = produced.size();
    Assert.assertEquals(producedCount, 1);
    verifyWorkUnits(produced, producedCount);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) DatePartitionedAvroFileSource(org.apache.gobblin.source.DatePartitionedAvroFileSource) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 65 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method testWorksNoPrefix.

/**
 * Reads minute-partitioned data without setting the partition prefix property; instead
 * {@code PREFIX} is appended to the source data directory. Expects four work units, which are
 * then passed to {@code verifyWorkUnits}.
 */
@Test
public void testWorksNoPrefix() throws IOException, DataRecordException {
    // The prefix is folded into the data directory rather than configured as a partition prefix.
    String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY + Path.SEPARATOR + PREFIX;
    String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1));

    SourceState jobState = new SourceState();
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
    jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);
    jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    // Data is partitioned by minute, i.e. each work unit is assigned the records under a single
    // YYYY/MM/dd/HH_mm directory.
    DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
    List<WorkUnit> produced = avroSource.getWorkunits(jobState);
    Assert.assertEquals(produced.size(), 4);
    verifyWorkUnits(produced);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) DatePartitionedAvroFileSource(org.apache.gobblin.source.DatePartitionedAvroFileSource) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Aggregations

WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)133 Test (org.testng.annotations.Test)59 SourceState (org.apache.gobblin.configuration.SourceState)40 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)40 MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)35 Extract (org.apache.gobblin.source.workunit.Extract)24 Path (org.apache.hadoop.fs.Path)19 State (org.apache.gobblin.configuration.State)13 IOException (java.io.IOException)11 ArrayList (java.util.ArrayList)10 Closer (com.google.common.io.Closer)9 Properties (java.util.Properties)9 WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval)8 List (java.util.List)7 Table (org.apache.hadoop.hive.ql.metadata.Table)7 ImmutableMap (com.google.common.collect.ImmutableMap)6 Config (com.typesafe.config.Config)6 File (java.io.File)6 IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder)6 WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream)6