use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.
the class PartitionedDataWriter method serializePartitionInfoToState.
/**
 * Gathers the {@link PartitionDescriptor} reported by each partition writer and, if at least
 * one was collected, stores them in {@link #state} under the partitions key of this branch
 * (serialized through {@link PartitionDescriptor#toPartitionJsonList}).
 *
 * <p>Writers whose data descriptor is {@code null} or is not a {@link PartitionDescriptor}
 * are skipped with a warning. When nothing was collected, the state is left untouched.
 */
private void serializePartitionInfoToState() {
  List<PartitionDescriptor> partitionDescriptors = new ArrayList<>();

  for (DataWriter partitionWriter : partitionWriters.asMap().values()) {
    Descriptor dataDescriptor = partitionWriter.getDataDescriptor();
    if (dataDescriptor instanceof PartitionDescriptor) {
      partitionDescriptors.add((PartitionDescriptor) dataDescriptor);
    } else if (dataDescriptor == null) {
      log.warn("Drop partition info as writer {} returns a null PartitionDescriptor", partitionWriter.toString());
    } else {
      log.warn("Drop partition info as writer {} does not return a PartitionDescriptor", partitionWriter.toString());
    }
  }

  if (partitionDescriptors.isEmpty()) {
    log.info("Partitions info not available. Will not serialize partitions");
    return;
  }

  state.setProp(getPartitionsKey(branchId), PartitionDescriptor.toPartitionJsonList(partitionDescriptors));
}
use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.
the class PartitionedWriterTest method test.
/**
 * Drives a {@link PartitionedDataWriter} through write, cleanup, close and commit, and checks
 * the sequence of actions recorded by {@link TestPartitionAwareWriterBuilder} after each step.
 * Also checks that partition info appears in the state only after close, and that
 * {@link PartitionedDataWriter#getPartitionInfoAndClean} returns it and removes it from the state.
 */
@Test
public void test() throws IOException {
State state = new State();
state.setProp(ConfigurationKeys.WRITER_PARTITIONER_CLASS, TestPartitioner.class.getCanonicalName());
TestPartitionAwareWriterBuilder builder = new TestPartitionAwareWriterBuilder();
PartitionedDataWriter writer = new PartitionedDataWriter<String, String>(builder, state);
// Constructing the writer alone must not record any builder action
Assert.assertEquals(builder.actions.size(), 0);
// First record: expect a BUILD for partition "a" followed by a WRITE of the record to it
String record1 = "abc";
writer.writeEnvelope(new RecordEnvelope(record1));
Assert.assertEquals(builder.actions.size(), 2);
TestPartitionAwareWriterBuilder.Action action = builder.actions.poll();
Assert.assertEquals(action.getPartition(), "a");
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.BUILD);
action = builder.actions.poll();
Assert.assertEquals(action.getPartition(), "a");
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.WRITE);
Assert.assertEquals(action.getTarget(), record1);
// After the first partition only, the writer still reports speculative attempts as safe
Assert.assertTrue(writer.isSpeculativeAttemptSafe());
// Second record: expect a BUILD for a new partition "1" followed by a WRITE to it
String record2 = "123";
writer.writeEnvelope(new RecordEnvelope(record2));
Assert.assertEquals(builder.actions.size(), 2);
action = builder.actions.poll();
Assert.assertEquals(action.getPartition(), "1");
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.BUILD);
// Once the second partition has been built, speculative attempts are no longer reported as safe
// NOTE(review): presumably the writer built for partition "1" is the unsafe one -- confirm in the test builder
Assert.assertFalse(writer.isSpeculativeAttemptSafe());
action = builder.actions.poll();
Assert.assertEquals(action.getPartition(), "1");
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.WRITE);
Assert.assertEquals(action.getTarget(), record2);
// Writing record1 again: only a WRITE is recorded for partition "a", no second BUILD
writer.writeEnvelope(new RecordEnvelope(record1));
Assert.assertEquals(builder.actions.size(), 1);
action = builder.actions.poll();
Assert.assertEquals(action.getPartition(), "a");
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.WRITE);
Assert.assertEquals(action.getTarget(), record1);
// Counters reported by the partitioned writer after the three writes above
// NOTE(review): the expected 3 bytes presumably comes from how the test writers count bytes -- confirm
Assert.assertEquals(writer.recordsWritten(), 3);
Assert.assertEquals(writer.bytesWritten(), 3);
Assert.assertFalse(writer.isSpeculativeAttemptSafe());
// cleanup() is expected to record two CLEANUP actions, matching the two partitions built above
writer.cleanup();
Assert.assertEquals(builder.actions.size(), 2);
action = builder.actions.poll();
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.CLEANUP);
action = builder.actions.poll();
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.CLEANUP);
// Before close, partitions info is not serialized
String partitionsKey = "writer.0.partitions";
Assert.assertTrue(state.getProp(partitionsKey) == null);
// close() is expected to record two CLOSE actions
writer.close();
Assert.assertEquals(builder.actions.size(), 2);
action = builder.actions.poll();
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.CLOSE);
action = builder.actions.poll();
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.CLOSE);
// After close, partitions info is available
Assert.assertFalse(Strings.isNullOrEmpty(state.getProp(partitionsKey)));
// Reading the partition info back for branch 0 must also remove the key from the state
List<PartitionDescriptor> partitions = PartitionedDataWriter.getPartitionInfoAndClean(state, 0);
Assert.assertTrue(state.getProp(partitionsKey) == null);
Assert.assertEquals(partitions.size(), 2);
// The recovered descriptors are expected in the order "a", "1", both under the test dataset
DatasetDescriptor dataset = new DatasetDescriptor("testPlatform", "testDataset");
Assert.assertEquals(partitions.get(0), new PartitionDescriptor("a", dataset));
Assert.assertEquals(partitions.get(1), new PartitionDescriptor("1", dataset));
// commit() is expected to record two COMMIT actions
writer.commit();
Assert.assertEquals(builder.actions.size(), 2);
action = builder.actions.poll();
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.COMMIT);
action = builder.actions.poll();
Assert.assertEquals(action.getType(), TestPartitionAwareWriterBuilder.Actions.COMMIT);
}
use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.
the class LineageEventTest method testEventForPartitionedDataset.
/**
 * Sets a dataset-level source and a partition-level destination on a fresh state, loads the
 * lineage events back from that state, and verifies the event stored under key {@code "0"}.
 * The event is then built into a {@link GobblinTrackingEvent}, which must be recognized as a
 * lineage event and convert back to an equal {@link LineageEventBuilder}.
 */
@Test
public void testEventForPartitionedDataset() {
  final String sourceTopic = "testTopic";
  final String kafkaPlatform = "kafka";
  final String hdfsPlatform = "hdfs";
  final String destinationPath = "/data/tracking/PageViewEvent";
  final String hourlyPartition = "hourly/2018/08/15/15";

  State jobState = new State();
  LineageInfo lineage = getLineageInfo();

  // Source is a plain dataset
  DatasetDescriptor sourceDescriptor = new DatasetDescriptor(kafkaPlatform, sourceTopic);
  lineage.setSource(sourceDescriptor, jobState);

  // Destination is a partition of a dataset
  PartitionDescriptor destinationPartition =
      new PartitionDescriptor(hourlyPartition, new DatasetDescriptor(hdfsPlatform, destinationPath));
  lineage.putDestination(destinationPartition, 0, jobState);

  Map<String, Set<LineageEventBuilder>> loadedEvents = LineageInfo.load(jobState);
  LineageEventBuilder lineageEvent = first(loadedEvents.get("0"));
  verify(lineageEvent, sourceTopic, sourceDescriptor, destinationPartition);

  // Verify gobblin tracking event
  GobblinTrackingEvent builtEvent = lineageEvent.build();
  Assert.assertTrue(LineageEventBuilder.isLineageEvent(builtEvent));
  Assert.assertEquals(LineageEventBuilder.fromEvent(builtEvent), lineageEvent);
}
use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.
the class PartitionedFileSourceBase method addLineageSourceInfo.
/**
 * Sets the lineage source of the given work unit to a {@link PartitionDescriptor}.
 *
 * <p>The enclosing dataset is described by the source platform (falling back to
 * {@link DatasetConstants#PLATFORM_HDFS}), the file system URI (falling back to
 * {@link ConfigurationKeys#LOCAL_FS_URI}) and the configured data directory with its scheme
 * and authority stripped. The partition name is read from the work unit. When lineage info
 * is absent, the method only logs a message and does nothing else.
 *
 * @param workUnit work unit that supplies the partition name and receives the lineage source
 * @param state state from which the platform, data directory and file system URI are read
 */
@Override
protected void addLineageSourceInfo(WorkUnit workUnit, State state) {
  if (lineageInfo.isPresent()) {
    String sourcePlatform =
        state.getProp(ConfigurationKeys.SOURCE_FILEBASED_PLATFORM, DatasetConstants.PLATFORM_HDFS);

    Path sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY));
    String datasetName = Path.getPathWithoutSchemeAndAuthority(sourceDir).toString();

    String fsUri = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    DatasetDescriptor sourceDataset = new DatasetDescriptor(sourcePlatform, URI.create(fsUri), datasetName);

    String partition = workUnit.getProp(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_NAME);
    PartitionDescriptor sourcePartition = new PartitionDescriptor(partition, sourceDataset);

    lineageInfo.get().setSource(sourcePartition, workUnit);
  } else {
    log.info("Lineage is not enabled");
  }
}
use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.
the class FileBasedSourceTest method verifyPartitionSourceLineage.
/**
 * Asserts that the source lineage stored on the work unit deserializes to a
 * {@link PartitionDescriptor} whose name is one of the expected partitions and whose dataset
 * equals the expected dataset descriptor.
 *
 * @param wu work unit holding the serialized source lineage
 * @param partitions partition names that are acceptable for this work unit
 * @param datasetDescriptor dataset the partition is expected to belong to
 */
private void verifyPartitionSourceLineage(WorkUnit wu, Set<String> partitions, DatasetDescriptor datasetDescriptor) {
  String lineageJson = wu.getProp(SOURCE_LINEAGE_KEY);
  PartitionDescriptor sourcePartition = (PartitionDescriptor) Descriptor.fromJson(lineageJson);

  String partitionName = sourcePartition.getName();
  Assert.assertTrue(partitions.contains(partitionName));
  Assert.assertEquals(sourcePartition.getDataset(), datasetDescriptor);
}
Aggregations