Use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by Apache.
Class JsonIntermediateToAvroConverterTest, method initResources.
/**
 * Loads a JSON test fixture from the classpath and initializes the shared test fields.
 *
 * <p>The fixture is parsed into a {@link JsonObject}; its {@code "record"} member is stored in
 * {@code jsonRecord} and its {@code "schema"} member in {@code jsonSchema}. A fresh
 * {@link WorkUnitState} is assigned to {@code state} with the Avro converter time format set to
 * {@code HH:mm:ss} and the date timezone set to {@code PST}.
 *
 * @param resourceFilePath classpath location of the JSON fixture, resolved relative to this class
 * @return the parsed fixture
 * @throws IllegalArgumentException if no resource exists at {@code resourceFilePath}
 * @throws java.io.UncheckedIOException if the resource stream cannot be closed
 */
private JsonObject initResources(String resourceFilePath) {
  java.io.InputStream resourceStream = this.getClass().getResourceAsStream(resourceFilePath);
  if (resourceStream == null) {
    // getResourceAsStream signals "not found" with null; fail with a message naming the resource
    // instead of an anonymous NullPointerException from the reader constructor.
    throw new IllegalArgumentException("Test resource not found on classpath: " + resourceFilePath);
  }

  JsonObject testData;
  // Decode explicitly as UTF-8 so the result does not depend on the platform default charset,
  // and close the reader (and the underlying stream) once parsing is done.
  try (InputStreamReader reader =
      new InputStreamReader(resourceStream, java.nio.charset.StandardCharsets.UTF_8)) {
    testData = new Gson().fromJson(reader, JsonObject.class);
  } catch (java.io.IOException e) {
    throw new java.io.UncheckedIOException("Failed to close test resource " + resourceFilePath, e);
  }

  jsonRecord = testData.get("record").getAsJsonObject();
  jsonSchema = testData.get("schema").getAsJsonArray();

  WorkUnit workUnit = new WorkUnit(new SourceState(),
      new Extract(new SourceState(), Extract.TableType.SNAPSHOT_ONLY, "namespace", "dummy_table"));
  state = new WorkUnitState(workUnit);
  state.setProp(ConfigurationKeys.CONVERTER_AVRO_TIME_FORMAT, "HH:mm:ss");
  state.setProp(ConfigurationKeys.CONVERTER_AVRO_DATE_TIMEZONE, "PST");
  return testData;
}
Use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by Apache.
Class SimpleJsonSource, method getWorkunits.
/**
 * Creates one {@link WorkUnit} per entry in the comma-separated list stored under
 * {@link ConfigurationKeys#SOURCE_FILEBASED_FILES_TO_PULL}.
 *
 * <p>All work units share a single snapshot-type {@link Extract}. Empty entries in the list are
 * skipped. When the property is absent, an empty list is returned.
 *
 * @param state the source state to read the file list and extract namespace from
 * @return the work units, one per listed file; never {@code null}
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<WorkUnit> result = Lists.newArrayList();

  if (state.contains(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL)) {
    // Every file belongs to the same snapshot-type extract.
    String namespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "ExampleNamespace");
    Extract sharedExtract = new Extract(Extract.TableType.SNAPSHOT_ONLY, namespace, "ExampleTable");

    Iterable<String> paths = Splitter.on(',')
        .omitEmptyStrings()
        .split(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));

    // One work unit per file, tagged with the file it should pull.
    for (String path : paths) {
      WorkUnit unit = WorkUnit.create(sharedExtract);
      unit.setProp(SOURCE_FILE_KEY, path);
      result.add(unit);
    }
  }

  return result;
}
Use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by Apache.
Class PartitionedFileSourceBase, method addNewWorkUnits.
/**
 * Helper method to add new {@link WorkUnit}s for this job. It asks the retriever for the files to process
 * (starting from the low watermark and bounded by the remaining per-job file budget), sorts them, and
 * creates a {@link WorkUnit} for each file. Each {@link WorkUnit} is then added to the given
 * {@link MultiWorkUnitWeightedQueue}, weighted by the file's size, and {@code fileCount} is incremented.
 *
 * <p>Each work unit's low watermark, high watermark and date partition key are all set to the file's
 * watermark. When {@link ConfigurationKeys#SCHEMA_IN_SOURCE_DIR} is enabled, {@code addSchemaFile} is
 * also invoked for the work unit.
 *
 * @param multiWorkUnitWeightedQueue queue that receives the newly created work units
 * @throws RuntimeException wrapping the {@link IOException} if one is raised while building the work units
 */
private void addNewWorkUnits(MultiWorkUnitWeightedQueue multiWorkUnitWeightedQueue) {
  try {
    List<PartitionAwareFileRetriever.FileInfo> filesToPull =
        retriever.getFilesToProcess(this.lowWaterMark, this.maxFilesPerJob - this.fileCount);
    Collections.sort(filesToPull);

    String topicName = this.sourceDir.getName();
    String namespace = this.sourceState.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    // Passed to getExtractForFile on every iteration so that one map is shared across all files.
    Map<Long, Extract> extractMap = new HashMap<>();

    for (PartitionAwareFileRetriever.FileInfo file : filesToPull) {
      Extract extract = getExtractForFile(file, topicName, namespace, extractMap);
      LOG.info("Will process file " + file.getFilePath());

      WorkUnit singleWorkUnit = WorkUnit.create(extract);
      singleWorkUnit.setProp(ConfigurationKeys.SOURCE_ENTITY, topicName);
      singleWorkUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, file.getFilePath());
      // A work unit covers exactly one file, so low/high watermark and partition key coincide.
      singleWorkUnit.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, file.getWatermarkMsSinceEpoch());
      singleWorkUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, file.getWatermarkMsSinceEpoch());
      singleWorkUnit.setProp(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_KEY, file.getWatermarkMsSinceEpoch());

      if (this.sourceState.getPropAsBoolean(ConfigurationKeys.SCHEMA_IN_SOURCE_DIR,
          ConfigurationKeys.DEFAULT_SCHEMA_IN_SOURCE_DIR)) {
        addSchemaFile(file, singleWorkUnit);
      }

      multiWorkUnitWeightedQueue.addWorkUnit(singleWorkUnit, file.getFileSize());
      this.fileCount++;
    }

    LOG.info("Total number of files extracted for the current run: " + filesToPull.size());
  } catch (IOException e) {
    // Explicit throw replaces the deprecated Throwables.propagate(e); for a checked exception the
    // effect is the same (a RuntimeException with e as its cause), but the control flow is now
    // visible to both the reader and the compiler.
    throw new RuntimeException(e);
  }
}
Use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by Apache.
Class WriterUtilsTest, method testGetWriterFilePath.
/**
 * Exercises {@code WriterUtils.getWriterFilePath} against a single work unit whose writer path
 * properties are changed step by step: first an explicit writer file path, then the same path
 * under the {@code ".0"}-suffixed key, and finally the {@code "tablename"} and
 * {@code "namespace_table"} path types after the explicit path has been removed.
 */
@Test
public void testGetWriterFilePath() {
  WorkUnit workUnit =
      WorkUnit.create(new Extract(TableType.SNAPSHOT_ONLY, "org.apache.gobblin.dbNamespace", "tableName"));

  // Explicit writer file path.
  workUnit.setProp(ConfigurationKeys.WRITER_FILE_PATH, TEST_WRITER_FILE_PATH);
  Assert.assertEquals(WriterUtils.getWriterFilePath(workUnit, 0, 0), TEST_WRITER_FILE_PATH);

  // Same path additionally stored under the ".0"-suffixed key.
  workUnit.setProp(ConfigurationKeys.WRITER_FILE_PATH + ".0", TEST_WRITER_FILE_PATH);
  Assert.assertEquals(WriterUtils.getWriterFilePath(workUnit, 1, 1), TEST_WRITER_FILE_PATH);

  // With the unsuffixed path removed, the configured path type determines the result.
  workUnit.removeProp(ConfigurationKeys.WRITER_FILE_PATH);

  Path tableOnlyPath = new Path("tableName");
  workUnit.setProp(ConfigurationKeys.WRITER_FILE_PATH_TYPE, "tablename");
  Assert.assertEquals(WriterUtils.getWriterFilePath(workUnit, 0, 0), tableOnlyPath);

  Path namespaceTablePath = new Path("org/apache/gobblin/dbNamespace/tableName");
  workUnit.setProp(ConfigurationKeys.WRITER_FILE_PATH_TYPE, "namespace_table");
  Assert.assertEquals(WriterUtils.getWriterFilePath(workUnit, 0, 0), namespaceTablePath);
}
Aggregations