Search in sources :

Example 26 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

The method initResources of the class JsonIntermediateToAvroConverterTest.

/**
 * Loads the JSON test fixture at {@code resourceFilePath} and prepares the shared test fields.
 *
 * <p>The fixture's {@code "record"} object is stored in {@code jsonRecord}, its {@code "schema"}
 * array in {@code jsonSchema}, and {@code state} is replaced with a new {@link WorkUnitState}
 * configured with the Avro time format {@code "HH:mm:ss"} and the date time zone {@code "PST"}.
 *
 * @param resourceFilePath path of the fixture, resolved through {@link Class#getResourceAsStream(String)}
 * @return the parsed fixture
 * @throws NullPointerException if no resource is found at {@code resourceFilePath}
 * @throws RuntimeException if closing the resource stream fails
 */
private JsonObject initResources(String resourceFilePath) {
    Type jsonObjectType = new TypeToken<JsonObject>() {
    }.getType();
    Gson gson = new Gson();
    JsonObject testData;
    // JDK types are fully qualified so this method does not depend on any additional imports.
    // The stream and reader are closed deterministically, and the fixture is decoded as UTF-8
    // instead of whatever the platform default charset happens to be.
    try (java.io.InputStream resource = java.util.Objects.requireNonNull(
            this.getClass().getResourceAsStream(resourceFilePath), "Test resource not found: " + resourceFilePath);
        InputStreamReader reader = new InputStreamReader(resource, java.nio.charset.StandardCharsets.UTF_8)) {
        testData = gson.fromJson(reader, jsonObjectType);
    } catch (java.io.IOException e) {
        throw new RuntimeException("Failed to close test resource " + resourceFilePath, e);
    }
    jsonRecord = testData.get("record").getAsJsonObject();
    jsonSchema = testData.get("schema").getAsJsonArray();
    WorkUnit workUnit = new WorkUnit(new SourceState(), new Extract(new SourceState(), Extract.TableType.SNAPSHOT_ONLY, "namespace", "dummy_table"));
    state = new WorkUnitState(workUnit);
    state.setProp(ConfigurationKeys.CONVERTER_AVRO_TIME_FORMAT, "HH:mm:ss");
    state.setProp(ConfigurationKeys.CONVERTER_AVRO_DATE_TIMEZONE, "PST");
    return testData;
}
Also used : Type(java.lang.reflect.Type) SourceState(org.apache.gobblin.configuration.SourceState) InputStreamReader(java.io.InputStreamReader) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) JsonObject(com.google.gson.JsonObject) Gson(com.google.gson.Gson) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 27 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

The method getWorkunits of the class SimpleJsonSource.

/**
 * Builds one {@link WorkUnit} per file named in
 * {@code ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL}.
 *
 * <p>The property is read as a comma-separated list in which empty entries are skipped. Every
 * work unit is created from the same {@code SNAPSHOT_ONLY} {@link Extract} for table
 * {@code "ExampleTable"}, whose namespace comes from
 * {@code ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY} (default {@code "ExampleNamespace"}).
 *
 * @param state the source state holding the job configuration
 * @return the created work units; empty when the files-to-pull property is not set
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    List<WorkUnit> result = Lists.newArrayList();
    if (state.contains(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL)) {
        // A single snapshot-type extract is shared by all files
        String namespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "ExampleNamespace");
        Extract sharedExtract = new Extract(Extract.TableType.SNAPSHOT_ONLY, namespace, "ExampleTable");
        Iterable<String> paths = Splitter.on(',').omitEmptyStrings().split(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
        for (String path : paths) {
            // Each file to pull gets its own work unit
            WorkUnit unit = WorkUnit.create(sharedExtract);
            unit.setProp(SOURCE_FILE_KEY, path);
            result.add(unit);
        }
    }
    return result;
}
Also used : Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 28 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

The method addNewWorkUnits of the class PartitionedFileSourceBase.

/**
 * Helper method to add new {@link WorkUnit}s for this job. It asks the file retriever for the files to process
 * (starting from the low watermark and limited to the remaining per-job file budget), sorts them, and creates a
 * {@link WorkUnit} for each file. Each {@link WorkUnit} is then added to the given
 * {@link MultiWorkUnitWeightedQueue}, weighted by the file's size, and the running file count is incremented.
 *
 * <p>For every file the work unit records the source entity (the source directory name), the file path to pull,
 * and the file's watermark as the low watermark, high watermark and date partition. If
 * {@code ConfigurationKeys.SCHEMA_IN_SOURCE_DIR} is enabled, {@code addSchemaFile} is also invoked for the file.
 *
 * @param multiWorkUnitWeightedQueue the queue that receives the newly created work units
 * @throws RuntimeException wrapping the {@link IOException} if listing or preparing the files fails
 */
private void addNewWorkUnits(MultiWorkUnitWeightedQueue multiWorkUnitWeightedQueue) {
    try {
        List<PartitionAwareFileRetriever.FileInfo> filesToPull = retriever.getFilesToProcess(this.lowWaterMark, this.maxFilesPerJob - this.fileCount);
        Collections.sort(filesToPull);
        String topicName = this.sourceDir.getName();
        String namespace = this.sourceState.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        // Passed to getExtractForFile on every iteration so extracts can be shared between files
        Map<Long, Extract> extractMap = new HashMap<>();
        for (PartitionAwareFileRetriever.FileInfo file : filesToPull) {
            Extract extract = getExtractForFile(file, topicName, namespace, extractMap);
            LOG.info("Will process file " + file.getFilePath());
            WorkUnit singleWorkUnit = WorkUnit.create(extract);
            singleWorkUnit.setProp(ConfigurationKeys.SOURCE_ENTITY, topicName);
            singleWorkUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, file.getFilePath());
            // The file's watermark serves as low watermark, high watermark and date partition alike
            singleWorkUnit.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, file.getWatermarkMsSinceEpoch());
            singleWorkUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, file.getWatermarkMsSinceEpoch());
            singleWorkUnit.setProp(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_KEY, file.getWatermarkMsSinceEpoch());
            if (this.sourceState.getPropAsBoolean(ConfigurationKeys.SCHEMA_IN_SOURCE_DIR, ConfigurationKeys.DEFAULT_SCHEMA_IN_SOURCE_DIR)) {
                addSchemaFile(file, singleWorkUnit);
            }
            multiWorkUnitWeightedQueue.addWorkUnit(singleWorkUnit, file.getFileSize());
            this.fileCount++;
        }
        LOG.info("Total number of files extracted for the current run: " + filesToPull.size());
    } catch (IOException e) {
        // Explicit wrap-and-throw replaces the deprecated Throwables.propagate(e); for a checked
        // exception such as IOException the two are equivalent, and the throw is now visible.
        throw new RuntimeException(e);
    }
}
Also used : HashMap(java.util.HashMap) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) IOException(java.io.IOException)

Example 29 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

The method testGetWriterFilePath of the class WriterUtilsTest.

/**
 * Checks the path returned by {@code WriterUtils.getWriterFilePath} for a work unit built on a
 * {@code SNAPSHOT_ONLY} extract with namespace {@code "org.apache.gobblin.dbNamespace"} and table
 * {@code "tableName"}: first with an explicitly configured writer file path, then with the
 * {@code "tablename"} and {@code "namespace_table"} writer file path types.
 */
@Test
public void testGetWriterFilePath() {
    WorkUnit workUnit = WorkUnit.create(new Extract(TableType.SNAPSHOT_ONLY, "org.apache.gobblin.dbNamespace", "tableName"));

    // An explicitly configured writer file path is expected back unchanged
    workUnit.setProp(ConfigurationKeys.WRITER_FILE_PATH, TEST_WRITER_FILE_PATH);
    Assert.assertEquals(WriterUtils.getWriterFilePath(workUnit, 0, 0), TEST_WRITER_FILE_PATH);

    // The same value stored under the ".0"-suffixed key, looked up with arguments (1, 1)
    String suffixedPathKey = ConfigurationKeys.WRITER_FILE_PATH + ".0";
    workUnit.setProp(suffixedPathKey, TEST_WRITER_FILE_PATH);
    Assert.assertEquals(WriterUtils.getWriterFilePath(workUnit, 1, 1), TEST_WRITER_FILE_PATH);

    // With the unsuffixed key removed, the path is expected to follow the configured path type
    workUnit.removeProp(ConfigurationKeys.WRITER_FILE_PATH);
    workUnit.setProp(ConfigurationKeys.WRITER_FILE_PATH_TYPE, "tablename");
    Path expectedTablePath = new Path("tableName");
    Assert.assertEquals(WriterUtils.getWriterFilePath(workUnit, 0, 0), expectedTablePath);

    workUnit.setProp(ConfigurationKeys.WRITER_FILE_PATH_TYPE, "namespace_table");
    Path expectedNamespaceTablePath = new Path("org/apache/gobblin/dbNamespace/tableName");
    Assert.assertEquals(WriterUtils.getWriterFilePath(workUnit, 0, 0), expectedNamespaceTablePath);
}
Also used : Path(org.apache.hadoop.fs.Path) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Aggregations

Extract (org.apache.gobblin.source.workunit.Extract)29 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)24 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)11 SourceState (org.apache.gobblin.configuration.SourceState)8 Test (org.testng.annotations.Test)7 Path (org.apache.hadoop.fs.Path)6 MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)4 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 Configuration (org.apache.hadoop.conf.Configuration)3 Gson (com.google.gson.Gson)2 JsonObject (com.google.gson.JsonObject)2 Config (com.typesafe.config.Config)2 InputStreamReader (java.io.InputStreamReader)2 Type (java.lang.reflect.Type)2 Map (java.util.Map)2 State (org.apache.gobblin.configuration.State)2 WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval)2 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)2 TableType (org.apache.gobblin.source.workunit.Extract.TableType)2