Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
In class GobblinHelixTaskTest, method testPrepareTask:
@Test
public void testPrepareTask() throws IOException {
// Serialize the JobState that will be read later in GobblinHelixTask
Path jobStateFilePath = new Path(appWorkDir, TestHelper.TEST_JOB_ID + "." + AbstractJobLauncher.JOB_STATE_FILE_NAME);
JobState jobState = new JobState();
jobState.setJobName(TestHelper.TEST_JOB_NAME);
jobState.setJobId(TestHelper.TEST_JOB_ID);
SerializationUtils.serializeState(this.localFs, jobStateFilePath, jobState);
// Prepare the WorkUnit
WorkUnit workUnit = WorkUnit.createEmpty();
prepareWorkUnit(workUnit);
// Prepare the source Json file
File sourceJsonFile = new File(this.appWorkDir.toString(), TestHelper.TEST_JOB_NAME + ".json");
TestHelper.createSourceJsonFile(sourceJsonFile);
workUnit.setProp(SimpleJsonSource.SOURCE_FILE_KEY, sourceJsonFile.getAbsolutePath());
// Serialize the WorkUnit into a file
// expected path is appWorkDir/_workunits/<job_id>/<job_name>.wu
Path workUnitDirPath = new Path(this.appWorkDir, GobblinClusterConfigurationKeys.INPUT_WORK_UNIT_DIR_NAME);
FsStateStore<WorkUnit> wuStateStore = new FsStateStore<>(this.localFs, workUnitDirPath.toString(), WorkUnit.class);
Path workUnitFilePath = new Path(new Path(workUnitDirPath, TestHelper.TEST_JOB_ID), TestHelper.TEST_JOB_NAME + ".wu");
wuStateStore.put(TestHelper.TEST_JOB_ID, TestHelper.TEST_JOB_NAME + ".wu", workUnit);
Assert.assertTrue(this.localFs.exists(workUnitFilePath));
// Prepare the GobblinHelixTask
Map<String, String> taskConfigMap = Maps.newHashMap();
taskConfigMap.put(GobblinClusterConfigurationKeys.WORK_UNIT_FILE_PATH, workUnitFilePath.toString());
taskConfigMap.put(ConfigurationKeys.JOB_NAME_KEY, TestHelper.TEST_JOB_NAME);
taskConfigMap.put(ConfigurationKeys.JOB_ID_KEY, TestHelper.TEST_JOB_ID);
taskConfigMap.put(ConfigurationKeys.TASK_KEY_KEY, Long.toString(Id.parse(TestHelper.TEST_JOB_ID).getSequence()));
TaskConfig taskConfig = new TaskConfig("", taskConfigMap, true);
TaskCallbackContext taskCallbackContext = Mockito.mock(TaskCallbackContext.class);
Mockito.when(taskCallbackContext.getTaskConfig()).thenReturn(taskConfig);
Mockito.when(taskCallbackContext.getManager()).thenReturn(this.helixManager);
GobblinHelixTaskFactory gobblinHelixTaskFactory = new GobblinHelixTaskFactory(Optional.<ContainerMetrics>absent(), this.taskExecutor, this.taskStateTracker, this.localFs, this.appWorkDir, ConfigFactory.empty(), this.helixManager);
this.gobblinHelixTask = (GobblinHelixTask) gobblinHelixTaskFactory.createNewTask(taskCallbackContext);
}
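A natural companion step, not shown in this excerpt, is to execute the prepared task and assert on its result. The lines below are only a hedged sketch (the assertion target is an assumption, not the project's actual follow-up test); GobblinHelixTask implements org.apache.helix.task.Task, whose run() returns an org.apache.helix.task.TaskResult:
// Hypothetical follow-up to testPrepareTask (a sketch, not project code)
TaskResult taskResult = this.gobblinHelixTask.run();
// COMPLETED is the Helix status for a successfully finished task
Assert.assertEquals(taskResult.getStatus(), TaskResult.Status.COMPLETED);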
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
In class SingleTask, method getWorkUnits:
private List<WorkUnit> getWorkUnits() throws IOException {
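// The work unit file path encodes the state-store layout: the parent directory
// is the store name and the file itself is the table name passed to getAll().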
String fileName = _workUnitFilePath.getName();
String storeName = _workUnitFilePath.getParent().getName();
WorkUnit workUnit;
if (fileName.endsWith(AbstractJobLauncher.MULTI_WORK_UNIT_FILE_EXTENSION)) {
workUnit = _stateStores.getMwuStateStore().getAll(storeName, fileName).get(0);
} else {
workUnit = _stateStores.getWuStateStore().getAll(storeName, fileName).get(0);
}
// The list of individual WorkUnits (flattened) to run
List<WorkUnit> workUnits = Lists.newArrayList();
if (workUnit instanceof MultiWorkUnit) {
// Flatten the MultiWorkUnit so the job configuration properties can be added to each individual WorkUnit
List<WorkUnit> flattenedWorkUnits = JobLauncherUtils.flattenWorkUnits(((MultiWorkUnit) workUnit).getWorkUnits());
workUnits.addAll(flattenedWorkUnits);
} else {
workUnits.add(workUnit);
}
return workUnits;
}
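For reference, the flattening performed by JobLauncherUtils.flattenWorkUnits is, in essence, a recursive unwrapping of nested MultiWorkUnits. The following is a minimal sketch of that idea, not the actual utility source:
// Sketch of recursive flattening
// (uses com.google.common.collect.Lists and org.apache.gobblin.source.workunit.MultiWorkUnit)
static List<WorkUnit> flatten(List<WorkUnit> workUnits) {
  List<WorkUnit> flattened = Lists.newArrayList();
  for (WorkUnit workUnit : workUnits) {
    if (workUnit instanceof MultiWorkUnit) {
      // Unwrap nested MultiWorkUnits until only plain WorkUnits remain
      flattened.addAll(flatten(((MultiWorkUnit) workUnit).getWorkUnits()));
    } else {
      flattened.add(workUnit);
    }
  }
  return flattened;
}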
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
In class SequentialTestSource, method initialWorkUnits:
private List<WorkUnit> initialWorkUnits() {
List<WorkUnit> workUnits = Lists.newArrayList();
for (int i = 0; i < num_parallelism; i++) {
WorkUnit workUnit = WorkUnit.create(newExtract(Extract.TableType.APPEND_ONLY, namespace, table));
LongWatermark lowWatermark = new LongWatermark(i * numRecordsPerExtract + 1);
LongWatermark expectedHighWatermark = new LongWatermark((i + 1) * numRecordsPerExtract);
workUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedHighWatermark));
workUnit.setProp(WORK_UNIT_INDEX, i);
workUnits.add(workUnit);
}
return workUnits;
}
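For example, with num_parallelism = 3 and numRecordsPerExtract = 100, the three work units receive the watermark intervals [1, 100], [101, 200], and [201, 300], partitioning the record range with no gaps or overlap.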
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
In class QueryBasedSource, method generateWorkUnits:
protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
List<WorkUnit> workUnits = Lists.newArrayList();
String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
TableType tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());
List<Partition> partitions = new Partitioner(state).getPartitionList(previousWatermark);
Collections.sort(partitions, Partitioner.ascendingComparator);
// {@link ConfigurationKeys.EXTRACT_TABLE_NAME_KEY} specifies the output table name for the Extract
String outputTableName = sourceEntity.getDestTableName();
log.info("Create extract output with table name is " + outputTableName);
Extract extract = createExtract(tableType, nameSpaceName, outputTableName);
// Setting current time for the full extract
if (Boolean.valueOf(state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY))) {
extract.setFullTrue(System.currentTimeMillis());
}
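// Each partition below becomes its own WorkUnit; all of them share the single
// Extract created above, so they publish to the same output table.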
for (Partition partition : partitions) {
WorkUnit workunit = WorkUnit.create(extract);
workunit.setProp(ConfigurationKeys.SOURCE_ENTITY, sourceEntity.getSourceEntityName());
workunit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, sourceEntity.getDestTableName());
workunit.setProp(WORK_UNIT_STATE_VERSION_KEY, CURRENT_WORK_UNIT_STATE_VERSION);
addLineageSourceInfo(state, sourceEntity, workunit);
partition.serialize(workunit);
workUnits.add(workunit);
}
return workUnits;
}
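The partition count directly determines the work unit count: for example, if the Partitioner splits the watermark range into seven daily partitions, the loop above yields seven work units, each carrying its own serialized watermark boundaries but all targeting the same destination table.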
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
In class FileBasedSource, method getWorkunits:
/**
 * This method takes the snapshot seen in the previous run and compares it to the list
 * of files currently in the source. It then decides which files it needs to pull
 * and distributes those files across the work units. The comparison is done by matching
 * the names of the files currently in the source against the names retrieved from the
 * previous state.
 * @param state is the source state
 * @return a list of work units for the framework to run
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
initLogger(state);
try {
initFileSystemHelper(state);
} catch (FileBasedHelperException e) {
Throwables.propagate(e);
}
log.info("Getting work units");
String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
String entityName = state.getProp(ConfigurationKeys.SOURCE_ENTITY);
// Override extract table name
String extractTableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
// If the extract table name is not set, fall back to the entity name
if (Strings.isNullOrEmpty(extractTableName)) {
extractTableName = entityName;
}
TableType tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());
List<WorkUnitState> previousWorkunits = Lists.newArrayList(state.getPreviousWorkUnitStates());
Set<String> prevFsSnapshot = Sets.newHashSet();
// Get list of files seen in the previous run
if (!previousWorkunits.isEmpty()) {
if (previousWorkunits.get(0).getWorkunit().contains(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT)) {
prevFsSnapshot.addAll(previousWorkunits.get(0).getWorkunit().getPropAsSet(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT));
} else if (state.getPropAsBoolean(ConfigurationKeys.SOURCE_FILEBASED_FS_PRIOR_SNAPSHOT_REQUIRED, ConfigurationKeys.DEFAULT_SOURCE_FILEBASED_FS_PRIOR_SNAPSHOT_REQUIRED)) {
// Fail fast rather than risk re-reading files that have already been processed.
throw new RuntimeException(String.format("No '%s' found on state of prior job", ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT));
}
}
List<WorkUnit> workUnits = Lists.newArrayList();
List<WorkUnit> previousWorkUnitsForRetry = this.getPreviousWorkUnitsForRetry(state);
log.info("Total number of work units from the previous failed runs: " + previousWorkUnitsForRetry.size());
for (WorkUnit previousWorkUnitForRetry : previousWorkUnitsForRetry) {
prevFsSnapshot.addAll(previousWorkUnitForRetry.getPropAsSet(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
workUnits.add(previousWorkUnitForRetry);
}
// Get the list of files currently in the source; the diff against the previous snapshot happens below
List<String> currentFsSnapshot = this.getcurrentFsSnapshot(state);
// The snapshot we want to save. This might not be the full snapshot if we don't pull all files.
List<String> effectiveSnapshot = Lists.newArrayList();
List<String> filesToPull = Lists.newArrayList();
int maxFilesToPull = state.getPropAsInt(ConfigurationKeys.SOURCE_FILEBASED_MAX_FILES_PER_RUN, Integer.MAX_VALUE);
int filesSelectedForPull = 0;
if (currentFsSnapshot.size() > maxFilesToPull) {
// If we are not going to pull all files, sort them lexicographically so there is some order in which they are ingested.
// Note that currentFsSnapshot.size() > maxFilesToPull does not imply some files will be ignored, since we still have to diff
// against the previous snapshot; this is just a quick check of whether sorting is worthwhile at all.
Collections.sort(currentFsSnapshot);
}
for (String file : currentFsSnapshot) {
if (prevFsSnapshot.contains(file)) {
effectiveSnapshot.add(file);
} else if ((filesSelectedForPull++) < maxFilesToPull) {
filesToPull.add(file.split(this.splitPattern)[0]);
effectiveSnapshot.add(file);
} else {
// This file is not pulled in this run; leaving it out of the effective snapshot lets a future run pick it up
}
}
// Refresh the snapshot on the retry work units as well. Otherwise a corrupt file could cause re-processing of already processed files
for (WorkUnit workUnit : previousWorkUnitsForRetry) {
workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT, StringUtils.join(effectiveSnapshot, ","));
}
if (!filesToPull.isEmpty()) {
logFilesToPull(filesToPull);
int numPartitions = state.contains(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS) && state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS) <= filesToPull.size() ? state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS) : filesToPull.size();
if (numPartitions <= 0) {
throw new IllegalArgumentException("The number of partitions should be positive");
}
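// Ceiling division: every file is assigned to some partition, with the last partition possibly smaller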
int filesPerPartition = filesToPull.size() % numPartitions == 0 ? filesToPull.size() / numPartitions : filesToPull.size() / numPartitions + 1;
// Distribute the files across the workunits
for (int fileOffset = 0; fileOffset < filesToPull.size(); fileOffset += filesPerPartition) {
// Use extract table name to create extract
Extract extract = new Extract(tableType, nameSpaceName, extractTableName);
WorkUnit workUnit = WorkUnit.create(extract);
// Eventually these setters should be integrated with framework support for generalized watermark handling
workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT, StringUtils.join(effectiveSnapshot, ","));
List<String> partitionFilesToPull = filesToPull.subList(fileOffset, Math.min(fileOffset + filesPerPartition, filesToPull.size()));
workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, StringUtils.join(partitionFilesToPull, ","));
if (state.getPropAsBoolean(ConfigurationKeys.SOURCE_FILEBASED_PRESERVE_FILE_NAME, false)) {
if (partitionFilesToPull.size() != 1) {
throw new RuntimeException("Cannot preserve the file name if a workunit is given multiple files");
}
workUnit.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, workUnit.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
}
workUnits.add(workUnit);
}
log.info("Total number of work units for the current run: " + (workUnits.size() - previousWorkUnitsForRetry.size()));
}
return workUnits;
}
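To make the partitioning arithmetic above concrete, here is a standalone illustration (the values are an assumed example, not taken from a real run):
// 10 files capped at SOURCE_MAX_NUMBER_OF_PARTITIONS = 3
int filesCount = 10;
int numPartitions = 3;
// Ceiling division: 10 % 3 != 0, so filesPerPartition = 10 / 3 + 1 = 4
int filesPerPartition = filesCount % numPartitions == 0 ? filesCount / numPartitions : filesCount / numPartitions + 1;
// fileOffset takes the values 0, 4, 8, yielding work units with 4, 4, and 2 files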