
Example 6 with FieldLineageInfo

use of io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo in project cdap by caskdata.

the class SparkRuntimeService method destroy.

/**
 * Calls the destroy or onFinish method of {@link ProgramLifecycle}.
 */
private void destroy(final ProgramState state) {
    context.setState(state);
    TransactionControl defaultTxControl = runtimeContext.getDefaultTxControl();
    TransactionControl txControl = spark instanceof ProgramLifecycle ? Transactions.getTransactionControl(defaultTxControl, Spark.class, spark, "destroy") : defaultTxControl;
    runtimeContext.destroyProgram(programLifecycle, txControl, false);
    if (emitFieldLineage()) {
        try {
            // We cannot call context.flushRecord() here, since the WorkflowNodeState
            // still needs to record and store the lineage information.
            FieldLineageInfo info = new FieldLineageInfo(runtimeContext.getFieldLineageOperations());
            fieldLineageWriter.write(runtimeContext.getProgramRunId(), info);
        } catch (Throwable t) {
            LOG.warn("Failed to emit the field lineage operations for Spark {}", runtimeContext.getProgramRunId(), t);
        }
    }
}
Also used : ProgramLifecycle(io.cdap.cdap.api.ProgramLifecycle) TransactionControl(io.cdap.cdap.api.annotation.TransactionControl) Spark(io.cdap.cdap.api.spark.Spark) AbstractSpark(io.cdap.cdap.api.spark.AbstractSpark) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo)
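
This destroy-time pattern recurs in Examples 7 and 8 below: resolve the transaction control, run the program's destroy lifecycle, and then, only if field lineage should be emitted, wrap the operations collected during the run in a FieldLineageInfo and hand it to the lineage writer, logging (rather than rethrowing) any failure so that a lineage problem never fails the run itself. A minimal sketch of that shared pattern, using only the FieldLineageInfo constructor shown above; the LineageSink interface, the String run id, and emitIfNeeded are hypothetical stand-ins for CDAP's writer and runtime context:

import io.cdap.cdap.api.lineage.field.Operation;
import io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;

final class LineageEmissionSketch {
    private static final Logger LOG = LoggerFactory.getLogger(LineageEmissionSketch.class);

    /** Hypothetical sink; CDAP's writer keys the info by program run id. */
    interface LineageSink {
        void write(String programRunId, FieldLineageInfo info);
    }

    static void emitIfNeeded(boolean emitFieldLineage, String programRunId,
                             Collection<? extends Operation> operations, LineageSink sink) {
        if (!emitFieldLineage) {
            return;
        }
        try {
            // Wrap the field operations recorded during the run.
            FieldLineageInfo info = new FieldLineageInfo(operations);
            sink.write(programRunId, info);
        } catch (Throwable t) {
            // Lineage emission is best-effort: log and move on, never fail the run.
            LOG.warn("Failed to emit the field lineage operations for {}", programRunId, t);
        }
    }
}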

Example 7 with FieldLineageInfo

use of io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo in project cdap by cdapio.

the class MapReduceRuntimeService method destroy.

/**
 * Calls the destroy method of {@link ProgramLifecycle}.
 */
private void destroy() {
    TransactionControl defaultTxControl = context.getDefaultTxControl();
    TransactionControl txControl = mapReduce instanceof ProgramLifecycle ? Transactions.getTransactionControl(defaultTxControl, MapReduce.class, mapReduce, "destroy") : defaultTxControl;
    context.destroyProgram(programLifecycle, txControl, false);
    if (emitFieldLineage()) {
        try {
            // We cannot call context.flushRecord() here, since the WorkflowNodeState
            // still needs to record and store the lineage information.
            FieldLineageInfo info = new FieldLineageInfo(context.getFieldLineageOperations());
            fieldLineageWriter.write(mapReduceRunId, info);
        } catch (Throwable t) {
            LOG.warn("Failed to emit the field lineage operations for MapReduce {}", mapReduceRunId, t);
        }
    }
}
Also used : ProgramLifecycle(io.cdap.cdap.api.ProgramLifecycle) TransactionControl(io.cdap.cdap.api.annotation.TransactionControl) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo) AbstractMapReduce(io.cdap.cdap.api.mapreduce.AbstractMapReduce) MapReduce(io.cdap.cdap.api.mapreduce.MapReduce)

Example 8 with FieldLineageInfo

use of io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo in project cdap by cdapio.

the class SparkRuntimeService method destroy.

/**
 * Calls the destroy or onFinish method of {@link ProgramLifecycle}.
 */
private void destroy(final ProgramState state) {
    context.setState(state);
    TransactionControl defaultTxControl = runtimeContext.getDefaultTxControl();
    TransactionControl txControl = spark instanceof ProgramLifecycle ? Transactions.getTransactionControl(defaultTxControl, Spark.class, spark, "destroy") : defaultTxControl;
    runtimeContext.destroyProgram(programLifecycle, txControl, false);
    if (emitFieldLineage()) {
        try {
            // We cannot call context.flushRecord() here, since the WorkflowNodeState
            // still needs to record and store the lineage information.
            FieldLineageInfo info = new FieldLineageInfo(runtimeContext.getFieldLineageOperations());
            fieldLineageWriter.write(runtimeContext.getProgramRunId(), info);
        } catch (Throwable t) {
            LOG.warn("Failed to emit the field lineage operations for Spark {}", runtimeContext.getProgramRunId(), t);
        }
    }
}
Also used : ProgramLifecycle(io.cdap.cdap.api.ProgramLifecycle) TransactionControl(io.cdap.cdap.api.annotation.TransactionControl) Spark(io.cdap.cdap.api.spark.Spark) AbstractSpark(io.cdap.cdap.api.spark.AbstractSpark) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo)

Example 9 with FieldLineageInfo

use of io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo in project cdap by cdapio.

the class LineageOperationProcessorTest method testAnotherSimplePipeline.

@Test
public void testAnotherSimplePipeline() {
    // n1-->n2-->n3-->n4
    // n1 => read: file1 -> (offset, body)
    // n2 => parse: (body) -> (first_name, last_name)
    // n3 => concat: (first_name, last_name) -> (name)
    // n4 => write_op: (offset, name) -> another_file
    Set<Connection> connections = new HashSet<>();
    connections.add(new Connection("n1", "n2"));
    connections.add(new Connection("n2", "n3"));
    connections.add(new Connection("n3", "n4"));
    Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
    List<FieldOperation> fieldOperations = new ArrayList<>();
    fieldOperations.add(new FieldReadOperation("read", "some read", EndPoint.of("ns", "file1"), "offset", "body"));
    stageOperations.put("n1", fieldOperations);
    fieldOperations = new ArrayList<>();
    fieldOperations.add(new FieldTransformOperation("parse", "parsing body", Collections.singletonList("body"), "first_name", "last_name"));
    stageOperations.put("n2", fieldOperations);
    fieldOperations = new ArrayList<>();
    fieldOperations.add(new FieldTransformOperation("concat", "concatinating the fields", Arrays.asList("first_name", "last_name"), "name"));
    stageOperations.put("n3", fieldOperations);
    fieldOperations = new ArrayList<>();
    fieldOperations.add(new FieldWriteOperation("write_op", "writing data to file", EndPoint.of("myns", "another_file"), Arrays.asList("offset", "name")));
    stageOperations.put("n4", fieldOperations);
    LineageOperationsProcessor processor = new LineageOperationsProcessor(connections, stageOperations, Collections.emptySet());
    Set<Operation> processedOperations = processor.process();
    ReadOperation read = new ReadOperation("n1.read", "some read", EndPoint.of("ns", "file1"), "offset", "body");
    TransformOperation parse = new TransformOperation("n2.parse", "parsing body", Collections.singletonList(InputField.of("n1.read", "body")), "first_name", "last_name");
    TransformOperation concat = new TransformOperation("n3.concat", "concatenating the fields", Arrays.asList(InputField.of("n2.parse", "first_name"), InputField.of("n2.parse", "last_name")), "name");
    WriteOperation write = new WriteOperation("n4.write_op", "writing data to file", EndPoint.of("myns", "another_file"), Arrays.asList(InputField.of("n1.read", "offset"), InputField.of("n3.concat", "name")));
    List<Operation> expectedOperations = new ArrayList<>();
    expectedOperations.add(parse);
    expectedOperations.add(concat);
    expectedOperations.add(read);
    expectedOperations.add(write);
    Assert.assertEquals(new FieldLineageInfo(expectedOperations), new FieldLineageInfo(processedOperations));
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo) Connection(io.cdap.cdap.etl.proto.Connection) HashMap(java.util.HashMap) HashSet(java.util.HashSet) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Test(org.junit.Test)
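
Note that the expected operations are added in a non-topological order (parse, concat, read, write) and compared against the processor's Set, so FieldLineageInfo equality cannot depend on the order or the container of the operations it is given. A small check distilled from that assertion, reusing the constructors from the test above (InputField's package is assumed to match its siblings in io.cdap.cdap.api.lineage.field):

import io.cdap.cdap.api.lineage.field.EndPoint;
import io.cdap.cdap.api.lineage.field.InputField;
import io.cdap.cdap.api.lineage.field.Operation;
import io.cdap.cdap.api.lineage.field.ReadOperation;
import io.cdap.cdap.api.lineage.field.WriteOperation;
import io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo;

import java.util.Arrays;
import java.util.List;

final class FieldLineageInfoEqualitySketch {
    public static void main(String[] args) {
        ReadOperation read = new ReadOperation("n1.read", "some read",
            EndPoint.of("ns", "file1"), "offset", "body");
        WriteOperation write = new WriteOperation("n4.write_op", "writing data to file",
            EndPoint.of("myns", "another_file"),
            Arrays.asList(InputField.of("n1.read", "offset"), InputField.of("n1.read", "body")));
        List<Operation> forward = Arrays.asList(read, write);
        List<Operation> backward = Arrays.asList(write, read);
        // Expected to print true: the test above equates a List-backed info
        // with a Set-backed one, so insertion order must not matter.
        System.out.println(new FieldLineageInfo(forward).equals(new FieldLineageInfo(backward)));
    }
}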

Example 10 with FieldLineageInfo

use of io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo in project cdap by cdapio.

the class LineageOperationProcessorTest method testDirectMerge.

@Test
public void testDirectMerge() {
    // n1 ---> n3
    // n1 ---> n4
    // n2 ---> n3
    // n2 ---> n4
    // n1 => pRead: personFile -> (offset, body)
    // n2 => hRead: hrFile -> (offset, body)
    // implicit merges => n1,n2.merge.offset and n1,n2.merge.body, each combining
    //   the same-named field from n1.pRead and n2.hRead
    // n3 => write1: (n1,n2.merge.offset, n1,n2.merge.body) -> testStore
    // n4 => write2: (n1,n2.merge.offset, n1,n2.merge.body) -> prodStore
    Set<Connection> connections = new HashSet<>();
    connections.add(new Connection("n1", "n3"));
    connections.add(new Connection("n1", "n4"));
    connections.add(new Connection("n2", "n3"));
    connections.add(new Connection("n2", "n4"));
    EndPoint pEndPoint = EndPoint.of("ns", "personFile");
    EndPoint hEndPoint = EndPoint.of("ns", "hrFile");
    EndPoint testEndPoint = EndPoint.of("ns", "testStore");
    EndPoint prodEndPoint = EndPoint.of("ns", "prodStore");
    Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
    List<FieldOperation> fieldOperations = new ArrayList<>();
    fieldOperations.add(new FieldReadOperation("pRead", "Reading from person file", pEndPoint, "offset", "body"));
    stageOperations.put("n1", fieldOperations);
    fieldOperations = new ArrayList<>();
    fieldOperations.add(new FieldReadOperation("hRead", "Reading from hr file", hEndPoint, "offset", "body"));
    stageOperations.put("n2", fieldOperations);
    fieldOperations = new ArrayList<>();
    fieldOperations.add(new FieldWriteOperation("write1", "Writing to test store", testEndPoint, "offset", "body"));
    stageOperations.put("n3", fieldOperations);
    fieldOperations = new ArrayList<>();
    fieldOperations.add(new FieldWriteOperation("write2", "Writing to prod store", prodEndPoint, "offset", "body"));
    stageOperations.put("n4", fieldOperations);
    LineageOperationsProcessor processor = new LineageOperationsProcessor(connections, stageOperations, Collections.emptySet());
    Set<Operation> processedOperations = processor.process();
    Set<Operation> expectedOperations = new HashSet<>();
    ReadOperation pRead = new ReadOperation("n1.pRead", "Reading from person file", pEndPoint, "offset", "body");
    expectedOperations.add(pRead);
    ReadOperation hRead = new ReadOperation("n2.hRead", "Reading from hr file", hEndPoint, "offset", "body");
    expectedOperations.add(hRead);
    // implicit merge should be added by app
    TransformOperation merge1 = new TransformOperation("n1,n2.merge.offset", "Merged stages: n1,n2", Arrays.asList(InputField.of("n1.pRead", "offset"), InputField.of("n2.hRead", "offset")), "offset");
    TransformOperation merge2 = new TransformOperation("n1,n2.merge.body", "Merged stages: n1,n2", Arrays.asList(InputField.of("n1.pRead", "body"), InputField.of("n2.hRead", "body")), "body");
    expectedOperations.add(merge1);
    expectedOperations.add(merge2);
    WriteOperation write1 = new WriteOperation("n3.write1", "Writing to test store", testEndPoint, Arrays.asList(InputField.of("n1,n2.merge.offset", "offset"), InputField.of("n1,n2.merge.body", "body")));
    expectedOperations.add(write1);
    WriteOperation write2 = new WriteOperation("n4.write2", "Writing to prod store", prodEndPoint, Arrays.asList(InputField.of("n1,n2.merge.offset", "offset"), InputField.of("n1,n2.merge.body", "body")));
    expectedOperations.add(write2);
    Assert.assertEquals(new FieldLineageInfo(expectedOperations), new FieldLineageInfo(processedOperations));
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo) Connection(io.cdap.cdap.etl.proto.Connection) HashMap(java.util.HashMap) HashSet(java.util.HashSet) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Test(org.junit.Test)
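
The implicit merge operations above follow a per-field naming convention: for stages n1 and n2 feeding the same consumers, each shared field gets one TransformOperation named "n1,n2.merge.<field>" whose inputs are the same-named field from every incoming stage. A hypothetical helper, not CDAP's implementation, showing how such merges could be built from the TransformOperation and InputField constructors used in this test:

import io.cdap.cdap.api.lineage.field.InputField;
import io.cdap.cdap.api.lineage.field.TransformOperation;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

final class ImplicitMergeSketch {
    /**
     * originOpByStage maps each merged stage (e.g. "n1") to the operation that
     * produced its fields (e.g. "n1.pRead"); pass a LinkedHashMap so the
     * comma-joined prefix ("n1,n2") is deterministic.
     */
    static List<TransformOperation> mergesFor(Map<String, String> originOpByStage, List<String> fields) {
        String prefix = String.join(",", originOpByStage.keySet());
        List<TransformOperation> merges = new ArrayList<>();
        for (String field : fields) {
            List<InputField> inputs = new ArrayList<>();
            for (String originOp : originOpByStage.values()) {
                // Each merged field draws the same-named field from every incoming stage.
                inputs.add(InputField.of(originOp, field));
            }
            merges.add(new TransformOperation(prefix + ".merge." + field,
                "Merged stages: " + prefix, inputs, field));
        }
        return merges;
    }
}

With n1 mapped to n1.pRead, n2 mapped to n2.hRead, and fields (offset, body), this yields exactly the merge1 and merge2 operations asserted above.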

Aggregations

FieldLineageInfo (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo) 18
Operation (io.cdap.cdap.api.lineage.field.Operation) 12
ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation) 12
TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation) 12
WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation) 12
ArrayList (java.util.ArrayList) 12
HashSet (java.util.HashSet) 12
Test (org.junit.Test) 12
ImmutableList (com.google.common.collect.ImmutableList) 8
FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation) 8
FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) 8
FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) 8
FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) 8
Connection (io.cdap.cdap.etl.proto.Connection) 8
HashMap (java.util.HashMap) 8
List (java.util.List) 8
ProgramLifecycle (io.cdap.cdap.api.ProgramLifecycle) 4
TransactionControl (io.cdap.cdap.api.annotation.TransactionControl) 4
EndPoint (io.cdap.cdap.api.lineage.field.EndPoint) 4
LineageStoreReader (io.cdap.cdap.data2.metadata.lineage.LineageStoreReader) 4