use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class HadoopFileInputSource method getWorkunits.
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    Job job = Job.getInstance(new Configuration());
    if (state.contains(FILE_INPUT_PATHS_KEY)) {
      for (String inputPath : state.getPropAsList(FILE_INPUT_PATHS_KEY)) {
        FileInputFormat.addInputPath(job, new Path(inputPath));
      }
    }
    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
    List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
    if (fileSplits == null || fileSplits.isEmpty()) {
      return ImmutableList.of();
    }
    Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
        ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
        : null;
    String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
    List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.size());
    for (InputSplit inputSplit : fileSplits) {
      // Create one WorkUnit per InputSplit
      FileSplit fileSplit = (FileSplit) inputSplit;
      Extract extract = createExtract(tableType, tableNamespace, tableName);
      WorkUnit workUnit = WorkUnit.create(extract);
      workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
      workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
      workUnits.add(workUnit);
    }
    return workUnits;
  } catch (IOException ioe) {
    throw new RuntimeException("Failed to get workunits", ioe);
  }
}
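A WorkUnit only carries string properties, so each FileSplit is flattened into a string with HadoopUtils.serializeToString and reconstructed later on the extractor side. That helper is not shown on this page; the following is a rough, self-contained sketch of what such a Writable-to-string round trip can look like using only standard Hadoop and JDK APIs. The class and method names below are illustrative, not Gobblin's actual implementation.

import java.io.IOException;
import java.util.Arrays;
import java.util.Base64;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WritableStringRoundTrip {

  // Flatten any Writable (e.g. a FileSplit) into a Base64 string so it can be
  // stored as a plain WorkUnit property.
  static String serializeToString(Writable writable) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    writable.write(out);
    return Base64.getEncoder().encodeToString(Arrays.copyOf(out.getData(), out.getLength()));
  }

  // Rebuild the Writable from the stored string, e.g. inside the extractor that
  // consumes the WorkUnit.
  static <T extends Writable> T deserializeFromString(T instance, String serialized) throws IOException {
    byte[] bytes = Base64.getDecoder().decode(serialized);
    DataInputBuffer in = new DataInputBuffer();
    in.reset(bytes, bytes.length);
    instance.readFields(in);
    return instance;
  }

  public static void main(String[] args) throws IOException {
    // Hypothetical split; the path and lengths are made up for the demo.
    FileSplit split = new FileSplit(new Path("/tmp/input/part-00000"), 0L, 1024L, new String[0]);
    String asString = serializeToString(split);
    FileSplit restored = deserializeFromString(new FileSplit(), asString);
    System.out.println(restored.getPath() + " [" + restored.getStart() + ", " + restored.getLength() + ")");
  }
}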
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class OldApiHadoopFileInputSource method getWorkunits.
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  JobConf jobConf = new JobConf(new Configuration());
  for (String key : state.getPropertyNames()) {
    jobConf.set(key, state.getProp(key));
  }
  if (state.contains(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
    for (String inputPath : state.getPropAsList(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
      FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    }
  }
  try {
    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, jobConf);
    InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf,
        state.getPropAsInt(HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY, HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED));
    if (fileSplits == null || fileSplits.length == 0) {
      return ImmutableList.of();
    }
    Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
        ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
        : null;
    String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
    List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.length);
    for (InputSplit inputSplit : fileSplits) {
      // Create one WorkUnit per InputSplit
      FileSplit fileSplit = (FileSplit) inputSplit;
      Extract extract = createExtract(tableType, tableNamespace, tableName);
      WorkUnit workUnit = WorkUnit.create(extract);
      workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
      workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
      workUnits.add(workUnit);
    }
    return workUnits;
  } catch (IOException ioe) {
    throw new RuntimeException("Failed to get workunits", ioe);
  }
}
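The old-API variant differs from the new-API version above in two ways: it copies every property from the SourceState into the JobConf before computing splits, and it passes a desired split count to getSplits (FILE_SPLITS_DESIRED_KEY, falling back to DEFAULT_FILE_SPLITS_DESIRED) instead of letting the input format decide on its own. The per-FileSplit WorkUnit construction is otherwise identical.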
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class HiveSerDeTest method testAvroOrcSerDes.
/**
 * This test uses the Avro SerDe to deserialize data from Avro files, and the ORC SerDe
 * to serialize them into ORC files.
 */
@Test(groups = { "gobblin.serde" })
public void testAvroOrcSerDes() throws IOException, DataRecordException, DataConversionException {
  Properties properties = new Properties();
  properties.load(new FileReader("gobblin-core/src/test/resources/serde/serde.properties"));
  SourceState sourceState = new SourceState(new State(properties), ImmutableList.<WorkUnitState>of());
  OldApiWritableFileSource source = new OldApiWritableFileSource();
  List<WorkUnit> workUnits = source.getWorkunits(sourceState);
  Assert.assertEquals(workUnits.size(), 1);
  WorkUnitState wus = new WorkUnitState(workUnits.get(0));
  wus.addAll(sourceState);
  Closer closer = Closer.create();
  HiveWritableHdfsDataWriter writer = null;
  try {
    OldApiWritableFileExtractor extractor = closer.register((OldApiWritableFileExtractor) source.getExtractor(wus));
    HiveSerDeConverter converter = closer.register(new HiveSerDeConverter());
    writer = closer.register((HiveWritableHdfsDataWriter) new HiveWritableHdfsDataWriterBuilder<>()
        .withBranches(1)
        .withWriterId("0")
        .writeTo(Destination.of(DestinationType.HDFS, sourceState))
        .writeInFormat(WriterOutputFormat.ORC)
        .build());
    converter.init(wus);
    Writable record;
    while ((record = extractor.readRecord(null)) != null) {
      Iterable<Writable> convertedRecordIterable = converter.convertRecordImpl(null, record, wus);
      Assert.assertEquals(Iterators.size(convertedRecordIterable.iterator()), 1);
      writer.write(convertedRecordIterable.iterator().next());
    }
  } catch (Throwable t) {
    throw closer.rethrow(t);
  } finally {
    closer.close();
    if (writer != null) {
      writer.commit();
    }
    Assert.assertTrue(this.fs.exists(new Path(sourceState.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR),
        sourceState.getProp(ConfigurationKeys.WRITER_FILE_NAME))));
    HadoopUtils.deletePath(this.fs, new Path(sourceState.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR)), true);
  }
}
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class QueryBasedExtractorTest method testDataPullUpperBoundsRemovedInLastWorkUnit.
@Test
public void testDataPullUpperBoundsRemovedInLastWorkUnit() {
  int totalCount = 5;
  ArrayList<DataRecord> records = this.generateRecords(totalCount);
  WorkUnit workUnit = WorkUnit.createEmpty();
  workUnit.setProp(Partition.IS_LAST_PARTIITON, true);
  workUnit.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");
  WorkUnitState workUnitState = new WorkUnitState(workUnit, new State());
  workUnitState.setId("testDataPullUpperBoundsRemovedInLastWorkUnit");
  TestQueryBasedExtractor testExtractor = new TestQueryBasedExtractor(workUnitState, records);
  testExtractor.setRangePredicates(1, 3);
  this.verify(testExtractor, totalCount);
}
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class QueryBasedExtractorTest method testDataPullUpperBoundsNotRemovedInLastWorkUnit.
@Test
public void testDataPullUpperBoundsNotRemovedInLastWorkUnit() {
  int totalCount = 5;
  ArrayList<DataRecord> records = this.generateRecords(totalCount);
  WorkUnit workUnit = WorkUnit.createEmpty();
  WorkUnitState workUnitState = new WorkUnitState(workUnit, new State());
  workUnitState.setId("testDataPullUpperBoundsNotRemovedInLastWorkUnit");
  // It's not a last work unit
  TestQueryBasedExtractor testExtractor = new TestQueryBasedExtractor(workUnitState, records);
  testExtractor.setRangePredicates(1, 3);
  this.verify(testExtractor, 3);
  // It's a last work unit, but the user specifies a high watermark
  workUnit.setProp(Partition.IS_LAST_PARTIITON, true);
  workUnit.setProp(Partition.HAS_USER_SPECIFIED_HIGH_WATERMARK, true);
  testExtractor.reset();
  testExtractor.setRangePredicates(1, 3);
  this.verify(testExtractor, 3);
  // It's a last work unit, but it already has WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY on record
  workUnit.removeProp(Partition.HAS_USER_SPECIFIED_HIGH_WATERMARK);
  workUnit.setProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY, "3");
  testExtractor.reset();
  testExtractor.setRangePredicates(1, 3);
  this.verify(testExtractor, 3);
}
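Taken together, these two tests pin down when the extractor drops the upper bound of its range predicate: only when the work unit is the last partition and neither a user-specified high watermark nor a previously recorded actual high watermark is present. In that case all five generated records are pulled despite the (1, 3) predicate; in every other case the predicate is honored and only three records come back.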