use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class DatePartitionedAvroFileExtractorTest method testReadPartitionsByMinuteWithLeadtime.
/**
 * Runs {@link DatePartitionedAvroFileSource#getWorkunits} with a partition lead time of 3 HOUR
 * configured and expects exactly one work unit to come back.
 *
 * <p>The minimum watermark is the test's start time minus one minute, rendered with
 * {@code DATE_PATTERN}. Both the partition prefix and suffix are configured.
 */
@Test
public void testReadPartitionsByMinuteWithLeadtime() throws IOException, DataRecordException {
  String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY;
  String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1));

  SourceState jobState = new SourceState();

  // Where the data lives and how the extract is identified.
  jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
  jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
  jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
  jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
  jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
  jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);

  // Date-partition layout and the starting watermark.
  jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
  jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);
  jobState.setProp("date.partitioned.source.partition.prefix", PREFIX);
  jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);

  // Lead time of three hours. Per the fixture description, the first partition is 6 hours old
  // and the remaining ones are 2 hours old, so only the first should be picked up.
  jobState.setProp("date.partitioned.source.partition.lead_time.size", "3");
  jobState.setProp("date.partitioned.source.partition.lead_time.granularity", "HOUR");

  DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
  List<WorkUnit> produced = avroSource.getWorkunits(jobState);

  int producedCount = produced.size();
  Assert.assertEquals(producedCount, 1);
  verifyWorkUnits(produced, producedCount);
}
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class DatePartitionedAvroFileExtractorTest method testWorksNoPrefix.
/**
 * Runs {@link DatePartitionedAvroFileSource#getWorkunits} without setting a partition prefix
 * property; {@code PREFIX} is appended to the data directory path instead. Expects four work
 * units.
 */
@Test
public void testWorksNoPrefix() throws IOException, DataRecordException {
  // The prefix is folded into the directory itself rather than configured separately.
  String dataDirectory = OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY + Path.SEPARATOR + PREFIX;
  String minWatermark = DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1));

  SourceState jobState = new SourceState();
  jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
  jobState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
  jobState.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, dataDirectory);
  jobState.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
  jobState.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
  jobState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
  jobState.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
  jobState.setProp("date.partitioned.source.min.watermark.value", minWatermark);
  jobState.setProp("date.partitioned.source.partition.suffix", SUFFIX);

  // Data is partitioned by minute: each work unit is assigned the records found under a single
  // YYYY/MM/dd/HH_mm directory.
  DatePartitionedAvroFileSource avroSource = new DatePartitionedAvroFileSource();
  List<WorkUnit> produced = avroSource.getWorkunits(jobState);

  Assert.assertEquals(produced.size(), 4);
  verifyWorkUnits(produced);
}
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class FileBasedSourceTest method numberOfWorkUnits.
/**
 * Checks that {@link DatePartitionedJsonFileSource#getWorkunits} returns three work units for a
 * {@link SourceState} populated by {@code initState}.
 *
 * @throws IOException declared by the test signature
 */
@Test
void numberOfWorkUnits() throws IOException {
  SourceState sourceState = new SourceState();
  DatePartitionedJsonFileSource source = new DatePartitionedJsonFileSource();
  initState(sourceState);
  List<WorkUnit> workUnits = source.getWorkunits(sourceState);
  // Actual value first, expected value second, so a failure reports the two the right way round.
  // NOTE(review): assumes Assert is org.testng.Assert (actual, expected) — confirm against imports.
  Assert.assertEquals(workUnits.size(), 3);
}
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class FileBasedSourceTest method testFailJobWhenPreviousStateExistsButDoesNotHaveSnapshot.
/**
 * Checks that {@code getWorkunits} throws a {@link RuntimeException} with the expected message
 * when a prior work unit state exists but carries no snapshot property.
 *
 * <p>The fixture is a single prior {@link WorkUnitState} with only its id set, a table type of
 * {@code SNAPSHOT_ONLY}, and {@code SOURCE_FILEBASED_FS_PRIOR_SNAPSHOT_REQUIRED} set to
 * {@code true}.
 */
@Test
public void testFailJobWhenPreviousStateExistsButDoesNotHaveSnapshot() {
  // Fixture setup stays outside the try block so that only an exception raised by the call
  // under test is matched against the expected message.
  DummyFileBasedSource source = new DummyFileBasedSource();
  WorkUnitState workUnitState = new WorkUnitState();
  workUnitState.setId("priorState");
  List<WorkUnitState> workUnitStates = Lists.newArrayList(workUnitState);
  State state = new State();
  state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, Extract.TableType.SNAPSHOT_ONLY.toString());
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_PRIOR_SNAPSHOT_REQUIRED, true);
  SourceState sourceState = new SourceState(state, workUnitStates);

  try {
    source.getWorkunits(sourceState);
    Assert.fail("Expected RuntimeException, but no exceptions were thrown.");
  } catch (RuntimeException e) {
    // Actual value first, expected value second, so a failure reports the two the right way round.
    // NOTE(review): assumes Assert is org.testng.Assert (actual, expected) — confirm against imports.
    Assert.assertEquals(e.getMessage(), "No 'source.filebased.fs.snapshot' found on state of prior job");
  }
}
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class HadoopFsHelperTest method testGetFileStreamSucceedsWithGZIPFile.
/**
 * Checks that {@link HadoopFsHelper#getFileStream} yields the expected text for the
 * {@code /source/simple.tsv.gz} classpath resource.
 *
 * <p>The helper is configured with the {@code /source/} resource directory as its filesystem
 * URI. The stream it returns is read as UTF-8 and compared with two tab-separated rows.
 *
 * @throws FileBasedHelperException declared by the test signature
 * @throws IOException if reading or closing the stream fails
 */
@Test
public void testGetFileStreamSucceedsWithGZIPFile() throws FileBasedHelperException, IOException {
  SourceState sourceState = new SourceState();
  URL rootUrl = getClass().getResource("/source/");
  String rootPath = rootUrl.toString();
  sourceState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, rootPath);
  HadoopFsHelper fsHelper = new HadoopFsHelper(sourceState);
  fsHelper.connect();
  URL url = getClass().getResource("/source/simple.tsv.gz");
  String path = url.toString();
  // try-with-resources closes the stream even if reading or the assertion fails.
  try (InputStream in = fsHelper.getFileStream(path)) {
    String contents = IOUtils.toString(in, "UTF-8");
    Assert.assertEquals(contents, "A\t1\nB\t2\n");
  }
}
Aggregations