Search in sources :

Example 66 with WorkUnitState

use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

the class CopyDataPublisher method groupByFileSet.

/**
 * Buckets the given {@link WorkUnitState}s by the {@link CopyEntity.DatasetAndPartition} they belong to.
 *
 * <p>For every state, the serialized {@link CopyEntity} and the serialized dataset metadata stored under
 * {@link CopySource#SERIALIZED_COPYABLE_DATASET} are read back, and the state is filed under the resulting
 * dataset/partition key. The grouping is used to mark all {@link WorkUnitState}s of a {@link CopyableDataset}
 * as {@link WorkUnitState.WorkingState#COMMITTED} once that dataset has been published successfully.
 *
 * @param states the work unit states to group
 * @return a multimap from dataset/partition key to the states filed under it, in iteration order of {@code states}
 */
private static Multimap<CopyEntity.DatasetAndPartition, WorkUnitState> groupByFileSet(Collection<? extends WorkUnitState> states) {
    Multimap<CopyEntity.DatasetAndPartition, WorkUnitState> statesByFileSet = ArrayListMultimap.create();
    for (WorkUnitState state : states) {
        // Deserialize the copy entity first, then the dataset metadata it is resolved against.
        CopyEntity copyEntity = CopySource.deserializeCopyEntity(state);
        String serializedDataset = state.getProp(CopySource.SERIALIZED_COPYABLE_DATASET);
        statesByFileSet.put(copyEntity.getDatasetAndPartition(CopyableDatasetMetadata.deserialize(serializedDataset)), state);
    }
    return statesByFileSet;
}
Also used : CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) CommitStepCopyEntity(org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState)

Example 67 with WorkUnitState

use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

the class CopyDataPublisher method getCommitSequence.

/**
 * Collects the {@link CommitStep}s carried by the given work units, ordered by ascending priority.
 *
 * <p>Only work units whose copy entity class is assignable to {@code baseClass} are considered; each of
 * those is deserialized and cast to {@link CommitStepCopyEntity}.
 *
 * @param workUnits the work unit states to inspect
 * @param baseClass only copy entities whose class is this class or a subtype of it are selected
 * @return the steps of the selected entities, sorted by {@link CommitStepCopyEntity#getPriority()}
 * @throws IOException declared by the original signature; presumably raised while deserializing a copy entity
 */
private static List<CommitStep> getCommitSequence(Collection<WorkUnitState> workUnits, Class<?> baseClass) throws IOException {
    List<CommitStepCopyEntity> commitEntities = Lists.newArrayList();
    for (WorkUnitState workUnit : workUnits) {
        if (!baseClass.isAssignableFrom(CopySource.getCopyEntityClass(workUnit))) {
            continue;
        }
        commitEntities.add((CommitStepCopyEntity) CopySource.deserializeCopyEntity(workUnit));
    }

    // Collections.sort is stable, so entities with equal priority keep their encounter order.
    Collections.sort(commitEntities, new Comparator<CommitStepCopyEntity>() {

        @Override
        public int compare(CommitStepCopyEntity left, CommitStepCopyEntity right) {
            return Integer.compare(left.getPriority(), right.getPriority());
        }
    });

    List<CommitStep> orderedSteps = Lists.newArrayList();
    for (CommitStepCopyEntity commitEntity : commitEntities) {
        orderedSteps.add(commitEntity.getStep());
    }
    return orderedSteps;
}
Also used : CommitStep(org.apache.gobblin.commit.CommitStep) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) CommitStepCopyEntity(org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity) Comparator(java.util.Comparator)

Example 68 with WorkUnitState

use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

the class PartitionLevelWatermarkerTest method testNoPreviousWatermarkWorkunits.

/**
 * Checks that, of two previous work unit states, only the one flagged with
 * {@link PartitionLevelWatermarker#IS_WATERMARK_WORKUNIT_KEY} shows up in
 * {@link PartitionLevelWatermarker#getPreviousWatermarks()}.
 */
@Test
public void testNoPreviousWatermarkWorkunits() throws Exception {
    // Create one previous workunit with IS_WATERMARK_WORKUNIT_KEY set to true
    WorkUnitState previousWus = new WorkUnitState();
    previousWus.setProp(ConfigurationKeys.DATASET_URN_KEY, "test_dataset_urn");
    previousWus.setProp(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY, true);
    previousWus.setActualHighWatermark(new MultiKeyValueLongWatermark(ImmutableMap.of("2015", 100L)));
    // Create one previous workunit with IS_WATERMARK_WORKUNIT_KEY not set (false)
    WorkUnitState previousWus2 = new WorkUnitState();
    previousWus2.setProp(ConfigurationKeys.DATASET_URN_KEY, "test_dataset_urn2");
    previousWus2.setActualHighWatermark(new LongWatermark(101L));
    SourceState state = new SourceState(new State(), Lists.newArrayList(previousWus, previousWus2));
    PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);
    // Only the flagged work unit is expected to contribute a previous watermark.
    Assert.assertEquals(watermarker.getPreviousWatermarks().size(), 1);
    Assert.assertEquals(watermarker.getPreviousWatermarks().get("test_dataset_urn"), ImmutableMap.of("2015", 100L));
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 69 with WorkUnitState

use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

the class PartitionLevelWatermarkerTest method testStateStoreReadWrite.

/**
 * Round-trips a watermark through a work unit state: a first watermarker records partition "2016" of
 * {@code table1} with the current time, the first work unit it emits is wrapped in a {@link WorkUnitState},
 * and a second watermarker built from that state must report the same value as its previous watermark.
 */
@Test
public void testStateStoreReadWrite() throws Exception {
    String dbName = "testStateStoreReadWrite";
    LocalHiveMetastoreTestUtils.getInstance().dropDatabaseIfExists(dbName);
    PartitionLevelWatermarker watermarker0 = new PartitionLevelWatermarker(new SourceState());
    Table mockTable = localTestTable(dbName, "table1", true);
    watermarker0.onTableProcessBegin(mockTable, 0L);
    long now = new DateTime().getMillis();
    watermarker0.onPartitionProcessBegin(localTestPartition(mockTable, ImmutableList.of("2016")), 0L, now);
    List<WorkUnit> workunits = Lists.newArrayList();
    watermarker0.onGetWorkunitsEnd(workunits);
    // Fail with a readable message instead of an IndexOutOfBoundsException if nothing was added.
    Assert.assertFalse(workunits.isEmpty(), "onGetWorkunitsEnd did not add any workunit");
    @SuppressWarnings("deprecation")
    WorkUnitState previousWus = new WorkUnitState(workunits.get(0));
    watermarker0.setActualHighWatermark(previousWus);
    SourceState state = new SourceState(new State(), Lists.newArrayList(previousWus));
    PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);
    Assert.assertEquals(watermarker.getPreviousWatermarks().size(), 1);
    Assert.assertEquals(watermarker.getPreviousWatermarks().get(dbName + "@table1"), ImmutableMap.of("2016", now));
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Table(org.apache.hadoop.hive.ql.metadata.Table) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) DateTime(org.joda.time.DateTime) Test(org.testng.annotations.Test)

Example 70 with WorkUnitState

use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

the class PartitionLevelWatermarkerTest method testDroppedPartitions.

/**
 * Starts from previous watermarks for partitions "2015-01" and "2015-02" of {@code db@test_dataset_urn},
 * then processes a partition "2015" whose parameters list both under
 * {@link AbstractAvroToOrcConverter#REPLACED_PARTITIONS_HIVE_METASTORE_KEY}. The expected high watermarks
 * for the dataset must then contain only the "2015" entry.
 */
@Test
public void testDroppedPartitions() throws Exception {
    WorkUnitState previousWus = new WorkUnitState();
    previousWus.setProp(ConfigurationKeys.DATASET_URN_KEY, "db@test_dataset_urn");
    previousWus.setProp(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY, true);
    previousWus.setActualHighWatermark(new MultiKeyValueLongWatermark(ImmutableMap.of("2015-01", 100L, "2015-02", 101L)));
    SourceState state = new SourceState(new State(), Lists.newArrayList(previousWus));
    PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);
    Table table = mockTable("test_dataset_urn");
    Mockito.when(table.getPartitionKeys()).thenReturn(ImmutableList.of(new FieldSchema("year", "string", "")));
    Partition partition2015 = mockPartition(table, ImmutableList.of("2015"));
    // partition 2015 replaces 2015-01 and 2015-02
    Mockito.when(partition2015.getParameters()).thenReturn(ImmutableMap.of(AbstractAvroToOrcConverter.REPLACED_PARTITIONS_HIVE_METASTORE_KEY, "2015-01|2015-02"));
    watermarker.onPartitionProcessBegin(partition2015, 0L, 0L);
    Assert.assertEquals(watermarker.getExpectedHighWatermarks().get("db@test_dataset_urn"), ImmutableMap.of("2015", 0L));
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) SourceState(org.apache.gobblin.configuration.SourceState) Table(org.apache.hadoop.hive.ql.metadata.Table) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Test(org.testng.annotations.Test)

Aggregations

WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)222 Test (org.testng.annotations.Test)143 State (org.apache.gobblin.configuration.State)48 SourceState (org.apache.gobblin.configuration.SourceState)39 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)39 Schema (org.apache.avro.Schema)29 Path (org.apache.hadoop.fs.Path)26 GenericRecord (org.apache.avro.generic.GenericRecord)19 JsonObject (com.google.gson.JsonObject)17 ArrayList (java.util.ArrayList)16 File (java.io.File)14 TaskState (org.apache.hadoop.mapreduce.v2.api.records.TaskState)12 List (java.util.List)11 Configuration (org.apache.hadoop.conf.Configuration)11 IOException (java.io.IOException)10 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)10 Extract (org.apache.gobblin.source.workunit.Extract)10 FileSystem (org.apache.hadoop.fs.FileSystem)10 Closer (com.google.common.io.Closer)8 JsonParser (com.google.gson.JsonParser)8