Search in sources :

Example 6 with PartitionDescriptor

use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.

the class PartitionedDataWriter method serializePartitionInfoToState.

/**
 * Gathers the data descriptor of every partition writer and, when at least one of them is a
 * {@link PartitionDescriptor}, stores the collected descriptors as a JSON list in {@link #state}
 * under the partitions key of this branch.
 *
 * <p>Writers whose descriptor is {@code null} or is not a {@link PartitionDescriptor} are skipped
 * with a warning. When nothing was collected, the state is left untouched.
 */
private void serializePartitionInfoToState() {
    List<PartitionDescriptor> partitionDescriptors = new ArrayList<>();
    for (DataWriter partitionWriter : partitionWriters.asMap().values()) {
        Descriptor dataDescriptor = partitionWriter.getDataDescriptor();
        if (dataDescriptor instanceof PartitionDescriptor) {
            partitionDescriptors.add((PartitionDescriptor) dataDescriptor);
        } else if (dataDescriptor == null) {
            log.warn("Drop partition info as writer {} returns a null PartitionDescriptor", partitionWriter.toString());
        } else {
            log.warn("Drop partition info as writer {} does not return a PartitionDescriptor", partitionWriter.toString());
        }
    }
    if (partitionDescriptors.isEmpty()) {
        log.info("Partitions info not available. Will not serialize partitions");
        return;
    }
    state.setProp(getPartitionsKey(branchId), PartitionDescriptor.toPartitionJsonList(partitionDescriptors));
}
Also used : ArrayList(java.util.ArrayList) PartitionDescriptor(org.apache.gobblin.dataset.PartitionDescriptor) Descriptor(org.apache.gobblin.dataset.Descriptor)

Example 7 with PartitionDescriptor

use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.

the class PartitionedWriterTest method test.

/**
 * Walks a {@link PartitionedDataWriter} through write, cleanup, close and commit, checking after
 * each step which actions the {@link TestPartitionAwareWriterBuilder} recorded and, around close,
 * whether partition info has been serialized into the {@link State}.
 *
 * <p>The assertions consume {@code builder.actions} with {@code poll()}, so each phase starts from
 * an empty action queue; the order of the statements below is significant.
 */
@Test
public void test() throws IOException {
    State state = new State();
    state.setProp(ConfigurationKeys.WRITER_PARTITIONER_CLASS, TestPartitioner.class.getCanonicalName());
    TestPartitionAwareWriterBuilder builder = new TestPartitionAwareWriterBuilder();
    PartitionedDataWriter writer = new PartitionedDataWriter<String, String>(builder, state);
    // Constructing the partitioned writer alone must not trigger any builder action.
    Assert.assertEquals(builder.actions.size(), 0);
    // First record: expected to land in partition "a"
    // (presumably TestPartitioner keys on the first character -- confirm in TestPartitioner).
    String record1 = "abc";
    writer.writeEnvelope(new RecordEnvelope(record1));
    // A new partition yields two actions: BUILD of its writer, then WRITE of the record.
    Assert.assertEquals(builder.actions.size(), 2);
    TestPartitionAwareWriterBuilder.Action action = builder.actions.poll();
    Assert.assertEquals(action.getPartition(), "a");
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.BUILD);
    action = builder.actions.poll();
    Assert.assertEquals(action.getPartition(), "a");
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.WRITE);
    Assert.assertEquals(action.getTarget(), record1);
    // With only the partition "a" writer present, speculative attempts are reported as safe.
    Assert.assertTrue(writer.isSpeculativeAttemptSafe());
    // Second record: expected to land in a different partition, "1".
    String record2 = "123";
    writer.writeEnvelope(new RecordEnvelope(record2));
    Assert.assertEquals(builder.actions.size(), 2);
    action = builder.actions.poll();
    Assert.assertEquals(action.getPartition(), "1");
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.BUILD);
    // Once the partition "1" writer exists, speculative attempts are no longer reported as safe
    // (NOTE(review): presumably the test builder makes that writer unsafe -- confirm there).
    Assert.assertFalse(writer.isSpeculativeAttemptSafe());
    action = builder.actions.poll();
    Assert.assertEquals(action.getPartition(), "1");
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.WRITE);
    Assert.assertEquals(action.getTarget(), record2);
    // Writing to the already-seen partition "a" again: only a WRITE, no second BUILD.
    writer.writeEnvelope(new RecordEnvelope(record1));
    Assert.assertEquals(builder.actions.size(), 1);
    action = builder.actions.poll();
    Assert.assertEquals(action.getPartition(), "a");
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.WRITE);
    Assert.assertEquals(action.getTarget(), record1);
    // Totals across both partitions after three writes
    // (NOTE(review): bytesWritten == 3 presumably means the test writers count one byte per record -- confirm).
    Assert.assertEquals(writer.recordsWritten(), 3);
    Assert.assertEquals(writer.bytesWritten(), 3);
    Assert.assertFalse(writer.isSpeculativeAttemptSafe());
    // cleanup() is expected to produce one CLEANUP action per partition writer (two in total).
    writer.cleanup();
    Assert.assertEquals(builder.actions.size(), 2);
    action = builder.actions.poll();
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.CLEANUP);
    action = builder.actions.poll();
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.CLEANUP);
    // Before close, partitions info is not serialized
    String partitionsKey = "writer.0.partitions";
    Assert.assertTrue(state.getProp(partitionsKey) == null);
    // close() is expected to produce one CLOSE action per partition writer.
    writer.close();
    Assert.assertEquals(builder.actions.size(), 2);
    action = builder.actions.poll();
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.CLOSE);
    action = builder.actions.poll();
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.CLOSE);
    // After close, partitions info is available
    Assert.assertFalse(Strings.isNullOrEmpty(state.getProp(partitionsKey)));
    // Reading the partition info for branch 0 must also remove the property from the state.
    List<PartitionDescriptor> partitions = PartitionedDataWriter.getPartitionInfoAndClean(state, 0);
    Assert.assertTrue(state.getProp(partitionsKey) == null);
    Assert.assertEquals(partitions.size(), 2);
    // NOTE(review): the index-based checks below assume the descriptors come back in the order
    // "a" then "1" -- confirm that the serialized order is deterministic.
    DatasetDescriptor dataset = new DatasetDescriptor("testPlatform", "testDataset");
    Assert.assertEquals(partitions.get(0), new PartitionDescriptor("a", dataset));
    Assert.assertEquals(partitions.get(1), new PartitionDescriptor("1", dataset));
    // commit() is expected to produce one COMMIT action per partition writer.
    writer.commit();
    Assert.assertEquals(builder.actions.size(), 2);
    action = builder.actions.poll();
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.COMMIT);
    action = builder.actions.poll();
    Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.COMMIT);
}
Also used : DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) State(org.apache.gobblin.configuration.State) TestPartitioner(org.apache.gobblin.writer.test.TestPartitioner) TestPartitionAwareWriterBuilder(org.apache.gobblin.writer.test.TestPartitionAwareWriterBuilder) PartitionDescriptor(org.apache.gobblin.dataset.PartitionDescriptor) Test(org.testng.annotations.Test)

Example 8 with PartitionDescriptor

use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.

the class LineageEventTest method testEventForPartitionedDataset.

/**
 * Registers a Kafka topic as lineage source and an HDFS partition as lineage destination for
 * branch 0, then checks that the lineage event loaded back from the state matches them and
 * survives a round trip through {@link GobblinTrackingEvent}.
 */
@Test
public void testEventForPartitionedDataset() {
    final String topic = "testTopic";
    final String kafka = "kafka";
    final String hdfs = "hdfs";
    final String path = "/data/tracking/PageViewEvent";
    final String partitionName = "hourly/2018/08/15/15";

    State jobState = new State();
    LineageInfo lineage = getLineageInfo();

    // Source side: the Kafka topic.
    DatasetDescriptor kafkaSource = new DatasetDescriptor(kafka, topic);
    lineage.setSource(kafkaSource, jobState);

    // Destination side: one partition of the HDFS dataset, attached to branch 0.
    PartitionDescriptor hdfsPartition = new PartitionDescriptor(partitionName, new DatasetDescriptor(hdfs, path));
    lineage.putDestination(hdfsPartition, 0, jobState);

    Map<String, Set<LineageEventBuilder>> eventsByBranch = LineageInfo.load(jobState);
    Set<LineageEventBuilder> branchZeroEvents = eventsByBranch.get("0");
    LineageEventBuilder lineageEvent = first(branchZeroEvents);
    verify(lineageEvent, topic, kafkaSource, hdfsPartition);

    // Verify gobblin tracking event
    GobblinTrackingEvent built = lineageEvent.build();
    Assert.assertEquals(LineageEventBuilder.isLineageEvent(built), true);
    Assert.assertEquals(LineageEventBuilder.fromEvent(built), lineageEvent);
}
Also used : Set(java.util.Set) GobblinTrackingEvent(org.apache.gobblin.metrics.GobblinTrackingEvent) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) State(org.apache.gobblin.configuration.State) PartitionDescriptor(org.apache.gobblin.dataset.PartitionDescriptor) Test(org.testng.annotations.Test)

Example 9 with PartitionDescriptor

use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.

the class PartitionedFileSourceBase method addLineageSourceInfo.

/**
 * Records the lineage source of a work unit as a {@link PartitionDescriptor}: the dataset is
 * built from the configured platform, file system URI and data directory (scheme and authority
 * stripped), and the partition name is read from the work unit's date-partition property.
 *
 * <p>Only logs a message and returns when {@code lineageInfo} is absent.
 *
 * @param workUnit work unit that supplies the partition name and receives the lineage source
 * @param state    state holding the file-based source configuration
 */
@Override
protected void addLineageSourceInfo(WorkUnit workUnit, State state) {
    if (lineageInfo.isPresent()) {
        // Dataset identity: platform + file system URI + data directory path.
        String sourcePlatform = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_PLATFORM, DatasetConstants.PLATFORM_HDFS);
        String dataDirectory = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY);
        String datasetName = Path.getPathWithoutSchemeAndAuthority(new Path(dataDirectory)).toString();
        String fsUri = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
        DatasetDescriptor sourceDataset = new DatasetDescriptor(sourcePlatform, URI.create(fsUri), datasetName);

        // Partition identity comes from the work unit itself.
        String datePartition = workUnit.getProp(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_NAME);
        PartitionDescriptor sourcePartition = new PartitionDescriptor(datePartition, sourceDataset);
        lineageInfo.get().setSource(sourcePartition, workUnit);
    } else {
        log.info("Lineage is not enabled");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) PartitionDescriptor(org.apache.gobblin.dataset.PartitionDescriptor) URI(java.net.URI)

Example 10 with PartitionDescriptor

use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.

the class FileBasedSourceTest method verifyPartitionSourceLineage.

/**
 * Asserts that the lineage source stored on the work unit under {@code SOURCE_LINEAGE_KEY} is a
 * {@link PartitionDescriptor} whose name is one of the expected partitions and whose dataset
 * equals the expected dataset descriptor.
 *
 * @param wu                work unit carrying the serialized lineage source
 * @param partitions        names of the partitions that are acceptable
 * @param datasetDescriptor dataset the partition is expected to belong to
 */
private void verifyPartitionSourceLineage(WorkUnit wu, Set<String> partitions, DatasetDescriptor datasetDescriptor) {
    String lineageJson = wu.getProp(SOURCE_LINEAGE_KEY);
    Descriptor parsed = Descriptor.fromJson(lineageJson);
    // The cast doubles as a type check: a non-partition descriptor fails the test here.
    PartitionDescriptor sourcePartition = (PartitionDescriptor) parsed;
    String partitionName = sourcePartition.getName();
    Assert.assertTrue(partitions.contains(partitionName));
    Assert.assertEquals(sourcePartition.getDataset(), datasetDescriptor);
}
Also used : PartitionDescriptor(org.apache.gobblin.dataset.PartitionDescriptor)

Aggregations

PartitionDescriptor (org.apache.gobblin.dataset.PartitionDescriptor)10 DatasetDescriptor (org.apache.gobblin.dataset.DatasetDescriptor)8 Test (org.testng.annotations.Test)5 ArrayList (java.util.ArrayList)3 State (org.apache.gobblin.configuration.State)3 Path (org.apache.hadoop.fs.Path)3 Set (java.util.Set)2 Descriptor (org.apache.gobblin.dataset.Descriptor)2 Closer (com.google.common.io.Closer)1 IOException (java.io.IOException)1 URI (java.net.URI)1 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)1 CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity)1 CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile)1 PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep)1 PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep)1 HiveRegisterStep (org.apache.gobblin.hive.HiveRegisterStep)1 HiveSpec (org.apache.gobblin.hive.spec.HiveSpec)1 SimpleHiveSpec (org.apache.gobblin.hive.spec.SimpleHiveSpec)1 GobblinTrackingEvent (org.apache.gobblin.metrics.GobblinTrackingEvent)1