
Example 6 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in the project incubator-gobblin by Apache.

From the class HadoopFileInputSource, method getWorkunits:

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    try {
        Job job = Job.getInstance(new Configuration());
        if (state.contains(FILE_INPUT_PATHS_KEY)) {
            for (String inputPath : state.getPropAsList(FILE_INPUT_PATHS_KEY)) {
                FileInputFormat.addInputPath(job, new Path(inputPath));
            }
        }
        FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
        List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
        if (fileSplits == null || fileSplits.isEmpty()) {
            return ImmutableList.of();
        }
        Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
                ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
                : null;
        String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
        List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.size());
        for (InputSplit inputSplit : fileSplits) {
            // Create one WorkUnit per InputSplit
            FileSplit fileSplit = (FileSplit) inputSplit;
            Extract extract = createExtract(tableType, tableNamespace, tableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
            workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
            workUnits.add(workUnit);
        }
        return workUnits;
    } catch (IOException ioe) {
        throw new RuntimeException("Failed to get workunits", ioe);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), Extract (org.apache.gobblin.source.workunit.Extract), IOException (java.io.IOException), FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
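
On the consuming side, an extractor typically reads these properties back off the WorkUnit it is handed. Below is a minimal sketch of that step, not Gobblin code: the helper class is illustrative, and it assumes HadoopFileInputSource lives in org.apache.gobblin.source.extractor.hadoop and that FILE_SPLIT_PATH_KEY is publicly accessible.

import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.extractor.hadoop.HadoopFileInputSource;
import org.apache.gobblin.source.workunit.WorkUnit;
import org.apache.hadoop.fs.Path;

public class SplitPathReader {

    // Wraps the WorkUnit in a WorkUnitState, as an extractor would see it,
    // and reads back the split path stored by getWorkunits() above.
    static Path splitPathOf(WorkUnit workUnit) {
        WorkUnitState workUnitState = new WorkUnitState(workUnit);
        return new Path(workUnitState.getProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY));
    }
}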

Example 7 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in the project incubator-gobblin by Apache.

From the class OldApiHadoopFileInputSource, method getWorkunits:

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    JobConf jobConf = new JobConf(new Configuration());
    for (String key : state.getPropertyNames()) {
        jobConf.set(key, state.getProp(key));
    }
    if (state.contains(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
        for (String inputPath : state.getPropAsList(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
            FileInputFormat.addInputPath(jobConf, new Path(inputPath));
        }
    }
    try {
        FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, jobConf);
        InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf, state.getPropAsInt(HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY, HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED));
        if (fileSplits == null || fileSplits.length == 0) {
            return ImmutableList.of();
        }
        Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
                ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
                : null;
        String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
        List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.length);
        for (InputSplit inputSplit : fileSplits) {
            // Create one WorkUnit per InputSplit
            FileSplit fileSplit = (FileSplit) inputSplit;
            Extract extract = createExtract(tableType, tableNamespace, tableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
            workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
            workUnits.add(workUnit);
        }
        return workUnits;
    } catch (IOException ioe) {
        throw new RuntimeException("Failed to get workunits", ioe);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), Extract (org.apache.gobblin.source.workunit.Extract), IOException (java.io.IOException), FileSplit (org.apache.hadoop.mapred.FileSplit), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit)
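
For context, here is a minimal sketch of how the SourceState consumed by getWorkunits(SourceState) could be populated before the call. The concrete paths, table names, and the assumption that the key constants are publicly accessible are illustrative; only the constants themselves and the comma-splitting behavior of getPropAsList come from the snippets above.

import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.SourceState;
import org.apache.gobblin.source.extractor.hadoop.HadoopFileInputSource;

public class SourceStateSetup {

    // Builds a SourceState carrying the properties that getWorkunits(SourceState) reads:
    // comma-separated input paths, the desired number of splits, and the Extract metadata.
    static SourceState exampleState() {
        SourceState state = new SourceState();
        state.setProp(HadoopFileInputSource.FILE_INPUT_PATHS_KEY, "/data/input/a,/data/input/b");
        state.setProp(HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY, 4);
        state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "SNAPSHOT_ONLY");
        state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "demo");
        state.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, "events");
        return state;
    }
}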

Example 8 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in the project incubator-gobblin by Apache.

From the class HiveSerDeTest, method testAvroOrcSerDes:

/**
 * This test uses the Avro SerDe to deserialize data from Avro files, and the ORC SerDe
 * to serialize them into ORC files.
 */
@Test(groups = { "gobblin.serde" })
public void testAvroOrcSerDes() throws IOException, DataRecordException, DataConversionException {
    Properties properties = new Properties();
    properties.load(new FileReader("gobblin-core/src/test/resources/serde/serde.properties"));
    SourceState sourceState = new SourceState(new State(properties), ImmutableList.<WorkUnitState>of());
    OldApiWritableFileSource source = new OldApiWritableFileSource();
    List<WorkUnit> workUnits = source.getWorkunits(sourceState);
    Assert.assertEquals(workUnits.size(), 1);
    WorkUnitState wus = new WorkUnitState(workUnits.get(0));
    wus.addAll(sourceState);
    Closer closer = Closer.create();
    HiveWritableHdfsDataWriter writer = null;
    try {
        OldApiWritableFileExtractor extractor = closer.register((OldApiWritableFileExtractor) source.getExtractor(wus));
        HiveSerDeConverter converter = closer.register(new HiveSerDeConverter());
        writer = closer.register((HiveWritableHdfsDataWriter) new HiveWritableHdfsDataWriterBuilder<>()
                .withBranches(1)
                .withWriterId("0")
                .writeTo(Destination.of(DestinationType.HDFS, sourceState))
                .writeInFormat(WriterOutputFormat.ORC)
                .build());
        converter.init(wus);
        Writable record;
        while ((record = extractor.readRecord(null)) != null) {
            Iterable<Writable> convertedRecordIterable = converter.convertRecordImpl(null, record, wus);
            Assert.assertEquals(Iterators.size(convertedRecordIterable.iterator()), 1);
            writer.write(convertedRecordIterable.iterator().next());
        }
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
        if (writer != null) {
            writer.commit();
        }
        Assert.assertTrue(this.fs.exists(new Path(sourceState.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR), sourceState.getProp(ConfigurationKeys.WRITER_FILE_NAME))));
        HadoopUtils.deletePath(this.fs, new Path(sourceState.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR)), true);
    }
}
Also used: Closer (com.google.common.io.Closer), Path (org.apache.hadoop.fs.Path), SourceState (org.apache.gobblin.configuration.SourceState), OldApiWritableFileExtractor (org.apache.gobblin.source.extractor.hadoop.OldApiWritableFileExtractor), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), HiveSerDeConverter (org.apache.gobblin.converter.serde.HiveSerDeConverter), Writable (org.apache.hadoop.io.Writable), Properties (java.util.Properties), HiveWritableHdfsDataWriterBuilder (org.apache.gobblin.writer.HiveWritableHdfsDataWriterBuilder), HiveWritableHdfsDataWriter (org.apache.gobblin.writer.HiveWritableHdfsDataWriter), State (org.apache.gobblin.configuration.State), FileReader (java.io.FileReader), OldApiWritableFileSource (org.apache.gobblin.source.extractor.hadoop.OldApiWritableFileSource), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Test (org.testng.annotations.Test)
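
The extractor, converter, and writer above are all managed through Guava's Closer. For reference, this is the general register/rethrow/close idiom the test follows, shown here on a plain InputStream; the file path and the byte-counting logic are only illustrative.

import com.google.common.io.Closer;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

public class CloserPattern {

    // Every resource is register()-ed, any Throwable is funneled through rethrow(),
    // and close() in the finally block closes the registered resources in reverse
    // order while preserving the primary exception.
    static long countBytes(String path) throws IOException {
        Closer closer = Closer.create();
        try {
            InputStream in = closer.register(new FileInputStream(path));
            long count = 0;
            while (in.read() != -1) {
                count++;
            }
            return count;
        } catch (Throwable t) {
            throw closer.rethrow(t);
        } finally {
            closer.close();
        }
    }
}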

Example 9 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in the project incubator-gobblin by Apache.

From the class QueryBasedExtractorTest, method testDataPullUpperBoundsRemovedInLastWorkUnit:

@Test
public void testDataPullUpperBoundsRemovedInLastWorkUnit() {
    int totalCount = 5;
    ArrayList<DataRecord> records = this.generateRecords(totalCount);
    WorkUnit workUnit = WorkUnit.createEmpty();
    workUnit.setProp(Partition.IS_LAST_PARTIITON, true);
    workUnit.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");
    WorkUnitState workUnitState = new WorkUnitState(workUnit, new State());
    workUnitState.setId("testDataPullUpperBoundsRemovedInLastWorkUnit");
    TestQueryBasedExtractor testExtractor = new TestQueryBasedExtractor(workUnitState, records);
    testExtractor.setRangePredicates(1, 3);
    this.verify(testExtractor, totalCount);
}
Also used: WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), State (org.apache.gobblin.configuration.State), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Test (org.testng.annotations.Test)
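
As a complement, a minimal sketch of the consuming side; the helper class is illustrative and it assumes Partition lives in org.apache.gobblin.source.extractor.partition. The extractor can read the last-partition flag back through the WorkUnitState when deciding whether to drop the data-pull upper bound.

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.extractor.partition.Partition;
import org.apache.gobblin.source.workunit.WorkUnit;

public class LastPartitionCheck {

    // Wraps the WorkUnit the same way the test does and reads the flag set on it.
    // The constant keeps Gobblin's own spelling, IS_LAST_PARTIITON.
    static boolean isLastPartition(WorkUnit workUnit) {
        WorkUnitState workUnitState = new WorkUnitState(workUnit, new State());
        return workUnitState.getPropAsBoolean(Partition.IS_LAST_PARTIITON, false);
    }
}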

Example 10 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in the project incubator-gobblin by Apache.

From the class QueryBasedExtractorTest, method testDataPullUpperBoundsNotRemovedInLastWorkUnit:

@Test
public void testDataPullUpperBoundsNotRemovedInLastWorkUnit() {
    int totalCount = 5;
    ArrayList<DataRecord> records = this.generateRecords(totalCount);
    WorkUnit workUnit = WorkUnit.createEmpty();
    WorkUnitState workUnitState = new WorkUnitState(workUnit, new State());
    workUnitState.setId("testDataPullUpperBoundsNotRemovedInLastWorkUnit");
    // It's not the last work unit
    TestQueryBasedExtractor testExtractor = new TestQueryBasedExtractor(workUnitState, records);
    testExtractor.setRangePredicates(1, 3);
    this.verify(testExtractor, 3);
    // It's the last work unit, but the user specified a high watermark
    workUnit.setProp(Partition.IS_LAST_PARTIITON, true);
    workUnit.setProp(Partition.HAS_USER_SPECIFIED_HIGH_WATERMARK, true);
    testExtractor.reset();
    testExtractor.setRangePredicates(1, 3);
    this.verify(testExtractor, 3);
    // It's the last work unit, but WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY is set on the record
    workUnit.removeProp(Partition.HAS_USER_SPECIFIED_HIGH_WATERMARK);
    workUnit.setProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY, "3");
    testExtractor.reset();
    testExtractor.setRangePredicates(1, 3);
    this.verify(testExtractor, 3);
}
Also used: WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), State (org.apache.gobblin.configuration.State), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Test (org.testng.annotations.Test)
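
To round this out, a small sketch (class and method names are illustrative, and it assumes the same Partition location as above) of how the two overriding conditions exercised by this test could be checked from a WorkUnitState:

import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.State;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.extractor.partition.Partition;
import org.apache.gobblin.source.workunit.WorkUnit;

public class UpperBoundPolicy {

    // The test asserts that the upper bound stays in place when the user specified the
    // high watermark explicitly, or when an actual high watermark is already recorded.
    static boolean keepUpperBound(WorkUnit workUnit) {
        WorkUnitState wus = new WorkUnitState(workUnit, new State());
        return wus.getPropAsBoolean(Partition.HAS_USER_SPECIFIED_HIGH_WATERMARK, false)
                || wus.getProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY) != null;
    }
}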

Aggregations

WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 133
Test (org.testng.annotations.Test): 59
SourceState (org.apache.gobblin.configuration.SourceState): 40
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 40
MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 35
Extract (org.apache.gobblin.source.workunit.Extract): 24
Path (org.apache.hadoop.fs.Path): 19
State (org.apache.gobblin.configuration.State): 13
IOException (java.io.IOException): 11
ArrayList (java.util.ArrayList): 10
Closer (com.google.common.io.Closer): 9
Properties (java.util.Properties): 9
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 8
List (java.util.List): 7
Table (org.apache.hadoop.hive.ql.metadata.Table): 7
ImmutableMap (com.google.common.collect.ImmutableMap): 6
Config (com.typesafe.config.Config): 6
File (java.io.File): 6
IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 6
WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6