Search in sources :

Example 6 with State

use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

the class DatePartitionedAvroFileExtractorTest method setUp.

/**
 * Writes {@code RECORD_SIZE} Avro records through a partitioned writer so that
 * partitioned files exist on the local file system before the tests run.
 *
 * @throws IOException declared because the writer operations used here may throw it
 */
@BeforeClass
public void setUp() throws IOException {
    this.schema = new Schema.Parser().parse(AVRO_SCHEMA);

    // Start time: six hours before "now" in TZ, pinned to minute 30, second 0 of that hour.
    DateTime sixHoursAgo = new DateTime(TZ).minusHours(6);
    this.startDateTime = new DateTime(
        sixHoursAgo.getYear(),
        sixHoursAgo.getMonthOfYear(),
        sixHoursAgo.getDayOfMonth(),
        sixHoursAgo.getHourOfDay(),
        30,
        0,
        TZ);

    // The first record sits exactly at the start time. The remaining records begin
    // four hours later and are spaced one minute apart.
    recordTimestamps[0] = startDateTime.getMillis();
    DateTime cursor = startDateTime.plusHours(4);
    for (int idx = 1; idx < RECORD_SIZE; idx++) {
        cursor = cursor.plusMinutes(1);
        recordTimestamps[idx] = cursor.getMillis();
    }

    // Writer configuration: partition column, local file system locations, and the
    // pattern/prefix/suffix consumed by the time-based partitioner.
    State writerConfig = new State();
    writerConfig.setProp(TimeBasedAvroWriterPartitioner.WRITER_PARTITION_COLUMNS, PARTITION_COLUMN_NAME);
    writerConfig.setProp(ConfigurationKeys.WRITER_BUFFER_SIZE, ConfigurationKeys.DEFAULT_BUFFER_SIZE);
    writerConfig.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, ConfigurationKeys.LOCAL_FS_URI);
    writerConfig.setProp(ConfigurationKeys.WRITER_STAGING_DIR, STAGING_DIR);
    writerConfig.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, OUTPUT_DIR);
    writerConfig.setProp(ConfigurationKeys.WRITER_FILE_PATH, SOURCE_ENTITY);
    writerConfig.setProp(ConfigurationKeys.WRITER_FILE_NAME, FILE_NAME);
    writerConfig.setProp(TimeBasedWriterPartitioner.WRITER_PARTITION_PATTERN, DATE_PATTERN);
    writerConfig.setProp(TimeBasedWriterPartitioner.WRITER_PARTITION_PREFIX, PREFIX);
    writerConfig.setProp(TimeBasedWriterPartitioner.WRITER_PARTITION_SUFFIX, SUFFIX);
    writerConfig.setProp(ConfigurationKeys.WRITER_PARTITIONER_CLASS, TimeBasedAvroWriterPartitioner.class.getName());

    DataWriterBuilder<Schema, GenericRecord> writerBuilder = new AvroDataWriterBuilder()
        .writeTo(Destination.of(Destination.DestinationType.HDFS, writerConfig))
        .writeInFormat(WriterOutputFormat.AVRO)
        .withWriterId("writer-1")
        .withSchema(this.schema)
        .withBranches(1)
        .forBranch(0);
    this.writer = new PartitionedDataWriter<Schema, GenericRecord>(writerBuilder, writerConfig);

    // Emit one record per prepared timestamp; only the partition column is set explicitly.
    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(this.schema);
    for (int idx = 0; idx < RECORD_SIZE; idx++) {
        recordBuilder.set(PARTITION_COLUMN_NAME, recordTimestamps[idx]);
        GenericRecord record = recordBuilder.build();
        this.writer.writeEnvelope(new RecordEnvelope<>(record));
    }

    // Order matters here: the writer is closed first and committed afterwards.
    this.writer.close();
    this.writer.commit();
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) SourceState(org.apache.gobblin.configuration.SourceState) Schema(org.apache.avro.Schema) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) GenericRecord(org.apache.avro.generic.GenericRecord) DateTime(org.joda.time.DateTime) TimeBasedAvroWriterPartitioner(org.apache.gobblin.writer.partitioner.TimeBasedAvroWriterPartitioner) AvroDataWriterBuilder(org.apache.gobblin.writer.AvroDataWriterBuilder) BeforeClass(org.testng.annotations.BeforeClass)

Example 7 with State

use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

the class QueryBasedExtractorTest method testDataPullUpperBoundsRemovedInLastWorkUnit.

/**
 * Builds an extractor over a work unit flagged as the last partition of a
 * SNAPSHOT extract, applies range predicates (1, 3), and verifies the extractor
 * against the full generated record count.
 */
@Test
public void testDataPullUpperBoundsRemovedInLastWorkUnit() {
    final int expectedRecords = 5;
    ArrayList<DataRecord> generated = this.generateRecords(expectedRecords);

    // Work unit marked as the last partition, using the SNAPSHOT extract type.
    WorkUnit lastPartition = WorkUnit.createEmpty();
    lastPartition.setProp(Partition.IS_LAST_PARTIITON, true);
    lastPartition.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");

    WorkUnitState lastPartitionState = new WorkUnitState(lastPartition, new State());
    lastPartitionState.setId("testDataPullUpperBoundsRemovedInLastWorkUnit");

    TestQueryBasedExtractor extractor = new TestQueryBasedExtractor(lastPartitionState, generated);
    extractor.setRangePredicates(1, 3);

    // Verified against all generated records, not just the 1..3 range.
    this.verify(extractor, expectedRecords);
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 8 with State

use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

the class QueryBasedExtractorTest method testDataPullUpperBoundsNotRemovedInLastWorkUnit.

/**
 * Runs the same extractor through three work-unit configurations and verifies it
 * against a count of 3 each time: a work unit that is not the last partition, a
 * last partition with a user-specified high watermark, and a last partition that
 * carries an actual high watermark value.
 */
@Test
public void testDataPullUpperBoundsNotRemovedInLastWorkUnit() {
    final int generatedCount = 5;
    final int boundedCount = 3;
    ArrayList<DataRecord> generated = this.generateRecords(generatedCount);

    WorkUnit unit = WorkUnit.createEmpty();
    WorkUnitState unitState = new WorkUnitState(unit, new State());
    unitState.setId("testDataPullUpperBoundsNotRemovedInLastWorkUnit");

    // Scenario 1: the work unit is not flagged as the last partition.
    TestQueryBasedExtractor extractor = new TestQueryBasedExtractor(unitState, generated);
    extractor.setRangePredicates(1, 3);
    this.verify(extractor, boundedCount);

    // Scenario 2: last partition, but the user specified a high watermark.
    unit.setProp(Partition.IS_LAST_PARTIITON, true);
    unit.setProp(Partition.HAS_USER_SPECIFIED_HIGH_WATERMARK, true);
    extractor.reset();
    extractor.setRangePredicates(1, 3);
    this.verify(extractor, boundedCount);

    // Scenario 3: last partition without the user flag, but with
    // WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY recorded on the work unit.
    unit.removeProp(Partition.HAS_USER_SPECIFIED_HIGH_WATERMARK);
    unit.setProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY, "3");
    extractor.reset();
    extractor.setRangePredicates(1, 3);
    this.verify(extractor, boundedCount);
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 9 with State

use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

the class QueryBasedSourceTest method testGetTableSpecificPropsFromState.

/**
 * Checks that {@code QueryBasedSource.getTableSpecificPropsFromState} matches
 * dataset-specific properties by entity name rather than by table name.
 */
@Test
public void testGetTableSpecificPropsFromState() {
    SourceState sourceState = new SourceState();
    sourceState.setProp(DatasetUtils.DATASET_SPECIFIC_PROPS, "[{\"dataset\":\"Entity1\", \"value\": 1}, {\"dataset\":\"Table2\", \"value\":2}]");

    // "Entity1" has its own entry in the props; its table name "Table2" also appears
    // as a dataset there, but the lookup is expected to use the entity name.
    SourceEntity matchingEntity = new SourceEntity("Entity1", "Table2");
    // Neither "Entity3" nor "Table3" appears in the dataset-specific props.
    SourceEntity unmatchedEntity = new SourceEntity("Entity3", "Table3");
    Set<SourceEntity> requested = ImmutableSet.of(matchingEntity, unmatchedEntity);

    Map<SourceEntity, State> propsByEntity = QueryBasedSource.getTableSpecificPropsFromState(requested, sourceState);

    // Only the matching entity gets props, and its value is 1; the other gets none.
    Assert.assertEquals(propsByEntity.size(), 1);
    Assert.assertTrue(propsByEntity.containsKey(matchingEntity));
    State matchedProps = propsByEntity.get(matchingEntity);
    Assert.assertEquals(matchedProps.getProp("value"), "1");
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) SourceEntity(org.apache.gobblin.source.extractor.extract.QueryBasedSource.SourceEntity) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkingState(org.apache.gobblin.configuration.WorkUnitState.WorkingState) Test(org.testng.annotations.Test)

Example 10 with State

use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

the class QueryBasedSourceTest method testSourceEntity.

/**
 * Exercises the {@code SourceEntity} factories: creation from a plain entity name,
 * from a name containing {@code $}, and from a {@code State} that carries the
 * source entity, the extract table name, or only one of the two.
 */
@Test
public void testSourceEntity() {
    // Plain name: entity, destination table and dataset names are all the same.
    SourceEntity plain = SourceEntity.fromSourceEntityName("SourceEntity1");
    Assert.assertEquals(plain.getSourceEntityName(), "SourceEntity1");
    Assert.assertEquals(plain.getDestTableName(), "SourceEntity1");
    Assert.assertEquals(plain.getDatasetName(), "SourceEntity1");

    // Name with '$': only the destination table name is expected with '_' instead.
    SourceEntity withDollar = SourceEntity.fromSourceEntityName("SourceEntity$2");
    Assert.assertEquals(withDollar.getSourceEntityName(), "SourceEntity$2");
    Assert.assertEquals(withDollar.getDestTableName(), "SourceEntity_2");
    Assert.assertEquals(withDollar.getDatasetName(), "SourceEntity$2");

    // State carrying both the source entity and an explicit table name.
    State entityAndTable = new State();
    entityAndTable.setProp(ConfigurationKeys.SOURCE_ENTITY, "SourceEntity3");
    entityAndTable.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, "SourceEntity3_Table");
    Optional<SourceEntity> fromEntityAndTable = SourceEntity.fromState(entityAndTable);
    Assert.assertTrue(fromEntityAndTable.isPresent());
    SourceEntity resolved = fromEntityAndTable.get();
    Assert.assertEquals(resolved.getSourceEntityName(), "SourceEntity3");
    Assert.assertEquals(resolved.getDestTableName(), "SourceEntity3_Table");
    Assert.assertEquals(resolved.getDatasetName(), "SourceEntity3");
    Assert.assertEquals(resolved, new SourceEntity("SourceEntity3", "SourceEntity3_Table"));

    // State carrying only the source entity: expected to equal the name-based factory result.
    State entityOnly = new State();
    entityOnly.setProp(ConfigurationKeys.SOURCE_ENTITY, "SourceEntity$4");
    Optional<SourceEntity> fromEntityOnly = SourceEntity.fromState(entityOnly);
    Assert.assertTrue(fromEntityOnly.isPresent());
    Assert.assertEquals(fromEntityOnly.get(), SourceEntity.fromSourceEntityName("SourceEntity$4"));

    // State carrying only the table name: the table name is expected to act as the entity name.
    State tableOnly = new State();
    tableOnly.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, "Table5");
    Optional<SourceEntity> fromTableOnly = SourceEntity.fromState(tableOnly);
    Assert.assertTrue(fromTableOnly.isPresent());
    Assert.assertEquals(fromTableOnly.get(), SourceEntity.fromSourceEntityName("Table5"));
}
Also used : SourceEntity(org.apache.gobblin.source.extractor.extract.QueryBasedSource.SourceEntity) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkingState(org.apache.gobblin.configuration.WorkUnitState.WorkingState) SourceState(org.apache.gobblin.configuration.SourceState) Test(org.testng.annotations.Test)

Aggregations

State (org.apache.gobblin.configuration.State)195 Test (org.testng.annotations.Test)103 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)74 SourceState (org.apache.gobblin.configuration.SourceState)38 Path (org.apache.hadoop.fs.Path)30 File (java.io.File)20 IOException (java.io.IOException)16 Map (java.util.Map)14 WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState)14 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)14 TaskState (org.apache.hadoop.mapreduce.v2.api.records.TaskState)13 Properties (java.util.Properties)12 FinalState (org.apache.gobblin.util.FinalState)12 Configuration (org.apache.hadoop.conf.Configuration)12 TaskLevelPolicyCheckResults (org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults)9 Config (com.typesafe.config.Config)8 ArrayList (java.util.ArrayList)8 GenericRecord (org.apache.avro.generic.GenericRecord)8 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)7 FileInputStream (java.io.FileInputStream)6