
Example 1 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

The class QueryBasedSource, method generateWorkUnits.

protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
    List<WorkUnit> workUnits = Lists.newArrayList();
    String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    TableType tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());
    List<Partition> partitions = new Partitioner(state).getPartitionList(previousWatermark);
    Collections.sort(partitions, Partitioner.ascendingComparator);
    // {@link ConfigurationKeys.EXTRACT_TABLE_NAME_KEY} specifies the output path for the Extract
    String outputTableName = sourceEntity.getDestTableName();
    log.info("Creating extract output with table name " + outputTableName);
    Extract extract = createExtract(tableType, nameSpaceName, outputTableName);
    // Setting current time for the full extract
    if (Boolean.valueOf(state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY))) {
        extract.setFullTrue(System.currentTimeMillis());
    }
    for (Partition partition : partitions) {
        WorkUnit workunit = WorkUnit.create(extract);
        workunit.setProp(ConfigurationKeys.SOURCE_ENTITY, sourceEntity.getSourceEntityName());
        workunit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, sourceEntity.getDestTableName());
        workunit.setProp(WORK_UNIT_STATE_VERSION_KEY, CURRENT_WORK_UNIT_STATE_VERSION);
        addLineageSourceInfo(state, sourceEntity, workunit);
        partition.serialize(workunit);
        workUnits.add(workunit);
    }
    return workUnits;
}
Also used : Partition(org.apache.gobblin.source.extractor.partition.Partition) TableType(org.apache.gobblin.source.workunit.Extract.TableType) Extract(org.apache.gobblin.source.workunit.Extract) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Partitioner(org.apache.gobblin.source.extractor.partition.Partitioner)
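
The same construction can be exercised outside a Source implementation. Below is a minimal, hypothetical sketch of the pattern Example 1 relies on: read the EXTRACT_* keys from a SourceState, build an Extract directly (inside a Source the examples call createExtract instead), optionally mark it as a full dump, and create a WorkUnit from it. The namespace, table, and entity values are placeholders, not taken from the project.

import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.SourceState;
import org.apache.gobblin.source.workunit.Extract;
import org.apache.gobblin.source.workunit.Extract.TableType;
import org.apache.gobblin.source.workunit.WorkUnit;

public class ExtractCreationSketch {

    public static void main(String[] args) {
        // Placeholder job configuration; in a real job these keys come from the .pull file
        SourceState state = new SourceState();
        state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "demo_namespace");
        state.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, "demo_table");
        state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "SNAPSHOT_ONLY");
        state.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, Boolean.TRUE.toString());

        TableType tableType =
            TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());
        // Built directly here; inside a Source the examples above call createExtract(...) instead
        Extract extract = new Extract(tableType,
            state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY),
            state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY));
        if (Boolean.valueOf(state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY))) {
            // Mark this extract as a full dump, stamped with the extraction time
            extract.setFullTrue(System.currentTimeMillis());
        }

        // All work units created from the same Extract share its namespace, table and type
        WorkUnit workUnit = WorkUnit.create(extract);
        workUnit.setProp(ConfigurationKeys.SOURCE_ENTITY, "demo_entity");
        System.out.println(workUnit);
    }
}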

Example 2 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

The class FileBasedSource, method getWorkunits.

/**
 * This method takes the snapshot seen in the previous run and compares it to the list
 * of files currently in the source. It then decides which files it needs to pull
 * and distributes those files across the workunits; the comparison is done by matching
 * the names of the files currently in the source against the names retrieved from the
 * previous state.
 * @param state is the source state
 * @return a list of workunits for the framework to run
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    initLogger(state);
    try {
        initFileSystemHelper(state);
    } catch (FileBasedHelperException e) {
        Throwables.propagate(e);
    }
    log.info("Getting work units");
    String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    String entityName = state.getProp(ConfigurationKeys.SOURCE_ENTITY);
    // Override extract table name
    String extractTableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
    // If extract table name is not found then consider entity name as extract table name
    if (Strings.isNullOrEmpty(extractTableName)) {
        extractTableName = entityName;
    }
    TableType tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());
    List<WorkUnitState> previousWorkunits = Lists.newArrayList(state.getPreviousWorkUnitStates());
    Set<String> prevFsSnapshot = Sets.newHashSet();
    // Get list of files seen in the previous run
    if (!previousWorkunits.isEmpty()) {
        if (previousWorkunits.get(0).getWorkunit().contains(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT)) {
            prevFsSnapshot.addAll(previousWorkunits.get(0).getWorkunit().getPropAsSet(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT));
        } else if (state.getPropAsBoolean(ConfigurationKeys.SOURCE_FILEBASED_FS_PRIOR_SNAPSHOT_REQUIRED, ConfigurationKeys.DEFAULT_SOURCE_FILEBASED_FS_PRIOR_SNAPSHOT_REQUIRED)) {
            // Fail fast so we don't accidentally re-process files that were already handled by a prior run.
            throw new RuntimeException(String.format("No '%s' found on state of prior job", ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT));
        }
    }
    List<WorkUnit> workUnits = Lists.newArrayList();
    List<WorkUnit> previousWorkUnitsForRetry = this.getPreviousWorkUnitsForRetry(state);
    log.info("Total number of work units from the previous failed runs: " + previousWorkUnitsForRetry.size());
    for (WorkUnit previousWorkUnitForRetry : previousWorkUnitsForRetry) {
        prevFsSnapshot.addAll(previousWorkUnitForRetry.getPropAsSet(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
        workUnits.add(previousWorkUnitForRetry);
    }
    // Get list of files that need to be pulled
    List<String> currentFsSnapshot = this.getcurrentFsSnapshot(state);
    // The snapshot we want to save. This might not be the full snapshot if we don't pull all files.
    List<String> effectiveSnapshot = Lists.newArrayList();
    List<String> filesToPull = Lists.newArrayList();
    int maxFilesToPull = state.getPropAsInt(ConfigurationKeys.SOURCE_FILEBASED_MAX_FILES_PER_RUN, Integer.MAX_VALUE);
    int filesSelectedForPull = 0;
    if (currentFsSnapshot.size() > maxFilesToPull) {
        // If we may end up not pulling all files, sort them lexicographically so there is some order in which they are ingested.
        // Note that currentFsSnapshot.size() > maxFilesToPull does not imply some files will be ignored, since we still diff
        // against the previous snapshot; this is just a quick check of whether sorting is worthwhile.
        Collections.sort(currentFsSnapshot);
    }
    for (String file : currentFsSnapshot) {
        if (prevFsSnapshot.contains(file)) {
            effectiveSnapshot.add(file);
        } else if ((filesSelectedForPull++) < maxFilesToPull) {
            filesToPull.add(file.split(this.splitPattern)[0]);
            effectiveSnapshot.add(file);
        } else {
        // file is not pulled this run
        }
    }
    // Update the retried work units with the effective snapshot as well.
    // Otherwise a corrupt file could cause re-processing of already processed files
    for (WorkUnit workUnit : previousWorkUnitsForRetry) {
        workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT, StringUtils.join(effectiveSnapshot, ","));
    }
    if (!filesToPull.isEmpty()) {
        logFilesToPull(filesToPull);
        int numPartitions = state.contains(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS) && state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS) <= filesToPull.size() ? state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS) : filesToPull.size();
        if (numPartitions <= 0) {
            throw new IllegalArgumentException("The number of partitions should be positive");
        }
        int filesPerPartition = filesToPull.size() % numPartitions == 0 ? filesToPull.size() / numPartitions : filesToPull.size() / numPartitions + 1;
        // Distribute the files across the workunits
        for (int fileOffset = 0; fileOffset < filesToPull.size(); fileOffset += filesPerPartition) {
            // Use extract table name to create extract
            Extract extract = new Extract(tableType, nameSpaceName, extractTableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            // Eventually these setters should be integrated with framework support for generalized watermark handling
            workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT, StringUtils.join(effectiveSnapshot, ","));
            List<String> partitionFilesToPull = filesToPull.subList(fileOffset, fileOffset + filesPerPartition > filesToPull.size() ? filesToPull.size() : fileOffset + filesPerPartition);
            workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, StringUtils.join(partitionFilesToPull, ","));
            if (state.getPropAsBoolean(ConfigurationKeys.SOURCE_FILEBASED_PRESERVE_FILE_NAME, false)) {
                if (partitionFilesToPull.size() != 1) {
                    throw new RuntimeException("Cannot preserve the file name if a workunit is given multiple files");
                }
                workUnit.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, workUnit.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
            }
            workUnits.add(workUnit);
        }
        log.info("Total number of work units for the current run: " + (workUnits.size() - previousWorkUnitsForRetry.size()));
    }
    return workUnits;
}
Also used : TableType(org.apache.gobblin.source.workunit.Extract.TableType) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
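
Stripped of the Gobblin state handling, the core of getWorkunits above is a snapshot diff with a per-run cap. The toy sketch below uses only the JDK and Guava with made-up file names (the splitPattern handling, where file names may carry metadata after a separator, is omitted). It shows how filesToPull and effectiveSnapshot are derived: previously seen files stay in the snapshot but are not pulled again, new files are pulled up to maxFilesToPull, and new files beyond the cap are left out of the snapshot so a later run still sees them as unprocessed.

import java.util.Collections;
import java.util.List;
import java.util.Set;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

public class SnapshotDiffSketch {

    public static void main(String[] args) {
        // Files recorded by the previous run and files visible in the source now (placeholders)
        Set<String> prevFsSnapshot = Sets.newHashSet("/data/a.csv", "/data/b.csv");
        List<String> currentFsSnapshot =
            Lists.newArrayList("/data/a.csv", "/data/b.csv", "/data/c.csv", "/data/d.csv");
        int maxFilesToPull = 1;

        List<String> effectiveSnapshot = Lists.newArrayList();
        List<String> filesToPull = Lists.newArrayList();
        int filesSelectedForPull = 0;

        // Sorting only matters when the cap can force some files to be skipped
        if (currentFsSnapshot.size() > maxFilesToPull) {
            Collections.sort(currentFsSnapshot);
        }
        for (String file : currentFsSnapshot) {
            if (prevFsSnapshot.contains(file)) {
                // Already processed: keep it in the snapshot but do not pull it again
                effectiveSnapshot.add(file);
            } else if (filesSelectedForPull++ < maxFilesToPull) {
                // New file within the per-run cap: pull it and remember it
                filesToPull.add(file);
                effectiveSnapshot.add(file);
            }
            // New files beyond the cap are left out of the effective snapshot,
            // so a later run will still treat them as unprocessed.
        }
        System.out.println("pull this run:      " + filesToPull);       // [/data/c.csv]
        System.out.println("effective snapshot: " + effectiveSnapshot); // a.csv, b.csv, c.csv
    }
}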

Example 3 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

The class HadoopFileInputSource, method getWorkunits.

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    try {
        Job job = Job.getInstance(new Configuration());
        if (state.contains(FILE_INPUT_PATHS_KEY)) {
            for (String inputPath : state.getPropAsList(FILE_INPUT_PATHS_KEY)) {
                FileInputFormat.addInputPath(job, new Path(inputPath));
            }
        }
        FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
        List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
        if (fileSplits == null || fileSplits.isEmpty()) {
            return ImmutableList.of();
        }
        Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY) ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase()) : null;
        String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
        List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.size());
        for (InputSplit inputSplit : fileSplits) {
            // Create one WorkUnit per InputSplit
            FileSplit fileSplit = (FileSplit) inputSplit;
            Extract extract = createExtract(tableType, tableNamespace, tableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
            workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
            workUnits.add(workUnit);
        }
        return workUnits;
    } catch (IOException ioe) {
        throw new RuntimeException("Failed to get workunits", ioe);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Extract(org.apache.gobblin.source.workunit.Extract) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
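
On the extractor side, each work unit carries the serialized split and its path, written above with FILE_SPLIT_BYTES_STRING_KEY and FILE_SPLIT_PATH_KEY. A minimal, hypothetical helper for reading the path back from a WorkUnitState might look like the following sketch; it assumes the FILE_SPLIT_PATH_KEY constant and the shown package are accessible from the caller (Example 4 references the constant across classes), otherwise the literal property name would be used instead.

// Package assumed for this import; adjust to wherever HadoopFileInputSource lives in your build
import org.apache.gobblin.source.extractor.hadoop.HadoopFileInputSource;

import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.hadoop.fs.Path;

public class SplitPathSketch {

    /**
     * Reads back the split path that getWorkunits stored on the work unit.
     * Returns null if the property was never set.
     */
    public static Path splitPath(WorkUnitState workUnitState) {
        String rawPath = workUnitState.getProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY);
        return rawPath == null ? null : new Path(rawPath);
    }
}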

Example 4 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

The class OldApiHadoopFileInputSource, method getWorkunits.

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    JobConf jobConf = new JobConf(new Configuration());
    for (String key : state.getPropertyNames()) {
        jobConf.set(key, state.getProp(key));
    }
    if (state.contains(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
        for (String inputPath : state.getPropAsList(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
            FileInputFormat.addInputPath(jobConf, new Path(inputPath));
        }
    }
    try {
        FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, jobConf);
        InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf, state.getPropAsInt(HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY, HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED));
        if (fileSplits == null || fileSplits.length == 0) {
            return ImmutableList.of();
        }
        Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY) ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase()) : null;
        String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
        List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.length);
        for (InputSplit inputSplit : fileSplits) {
            // Create one WorkUnit per InputSplit
            FileSplit fileSplit = (FileSplit) inputSplit;
            Extract extract = createExtract(tableType, tableNamespace, tableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
            workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
            workUnits.add(workUnit);
        }
        return workUnits;
    } catch (IOException ioe) {
        throw new RuntimeException("Failed to get workunits", ioe);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Extract(org.apache.gobblin.source.workunit.Extract) IOException(java.io.IOException) FileSplit(org.apache.hadoop.mapred.FileSplit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
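
The only substantive difference from Example 3 is the Hadoop API generation: the old mapred API configures the input format from a JobConf, takes a desired split count, and returns an array, while the new mapreduce API takes a Job and returns a list. A small, hypothetical sketch of the old-API call using TextInputFormat and a placeholder input directory:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class OldApiSplitSketch {

    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf();
        // Placeholder input directory; getSplits fails if it does not exist
        FileInputFormat.addInputPath(jobConf, new Path("/tmp/demo-input"));

        TextInputFormat inputFormat = new TextInputFormat();
        // The old mapred API configures the format from the JobConf...
        inputFormat.configure(jobConf);
        // ...and takes a hint for the desired number of splits, returning an array
        InputSplit[] splits = inputFormat.getSplits(jobConf, 4);
        System.out.println("Number of splits: " + splits.length);
    }
}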

Example 5 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

The class WikipediaSource, method getWorkunits.

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
    List<String> titles = new LinkedList<>(Splitter.on(",").omitEmptyStrings().splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));
    Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
    for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
        Iterable<LongWatermark> watermarks = Iterables.transform(entry.getValue(), new Function<WorkUnitState, LongWatermark>() {

            @Override
            public LongWatermark apply(WorkUnitState wus) {
                return wus.getActualHighWatermark(LongWatermark.class);
            }
        });
        watermarks = Iterables.filter(watermarks, Predicates.notNull());
        List<LongWatermark> watermarkList = Lists.newArrayList(watermarks);
        if (watermarkList.size() > 0) {
            prevHighWatermarks.put(entry.getKey(), Collections.max(watermarkList));
        }
    }
    Extract extract = createExtract(TableType.SNAPSHOT_ONLY, state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");
    List<WorkUnit> workUnits = Lists.newArrayList();
    for (String title : titles) {
        LongWatermark prevWatermark = prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) : new LongWatermark(-1);
        prevHighWatermarks.remove(title);
        WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
        workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
        workUnits.add(workUnit);
    }
    for (Map.Entry<String, LongWatermark> nonProcessedDataset : prevHighWatermarks.entrySet()) {
        WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
        workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
        workUnits.add(workUnit);
    }
    return workUnits;
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Extract(org.apache.gobblin.source.workunit.Extract) LinkedList(java.util.LinkedList) WatermarkInterval(org.apache.gobblin.source.extractor.WatermarkInterval) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Map(java.util.Map) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)
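
The watermark handling in Example 5 follows a simple seeding pattern: each dataset URN (a page title here) gets a work unit whose low watermark is the previous run's actual high watermark, or -1 if the title has never been pulled, and whose expected high watermark of -1 leaves the upper bound open for this run. A hypothetical, self-contained sketch with placeholder titles, namespace, and watermark values:

import java.util.List;
import java.util.Map;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.source.extractor.WatermarkInterval;
import org.apache.gobblin.source.extractor.extract.LongWatermark;
import org.apache.gobblin.source.workunit.Extract;
import org.apache.gobblin.source.workunit.Extract.TableType;
import org.apache.gobblin.source.workunit.WorkUnit;

public class WatermarkSeedingSketch {

    public static void main(String[] args) {
        // Hypothetical high watermarks recovered from the previous run, keyed by dataset URN
        Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
        prevHighWatermarks.put("LinkedIn", new LongWatermark(829469825L));

        // Placeholder namespace; the table name matches the one used in Example 5
        Extract extract = new Extract(TableType.SNAPSHOT_ONLY, "demo_namespace", "WikipediaOutput");

        List<WorkUnit> workUnits = Lists.newArrayList();
        for (String title : Lists.newArrayList("LinkedIn", "Wikipedia:Sandbox")) {
            // Resume from the previous high watermark if we have one, otherwise start at -1
            LongWatermark prevWatermark =
                prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) : new LongWatermark(-1);
            // An expected high watermark of -1 leaves the upper bound open for this run
            WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
            workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
            workUnits.add(workUnit);
        }
        System.out.println("Created " + workUnits.size() + " work units");
    }
}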

Aggregations

Extract (org.apache.gobblin.source.workunit.Extract): 29
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 24
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 11
SourceState (org.apache.gobblin.configuration.SourceState): 8
Test (org.testng.annotations.Test): 7
Path (org.apache.hadoop.fs.Path): 6
MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 4
IOException (java.io.IOException): 3
ArrayList (java.util.ArrayList): 3
Configuration (org.apache.hadoop.conf.Configuration): 3
Gson (com.google.gson.Gson): 2
JsonObject (com.google.gson.JsonObject): 2
Config (com.typesafe.config.Config): 2
InputStreamReader (java.io.InputStreamReader): 2
Type (java.lang.reflect.Type): 2
Map (java.util.Map): 2
State (org.apache.gobblin.configuration.State): 2
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 2
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 2
TableType (org.apache.gobblin.source.workunit.Extract.TableType): 2