Search in sources :

Example 11 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

the class FieldLineageTableTest method generateOperations.

/**
 * Builds the operations of a small read/parse/concat/write pipeline used as test data.
 *
 * <pre>
 *   read:   file -> (offset, body)
 *   parse:  (body) -> (first_name, last_name)
 *   concat: (first_name, last_name) -> (name)
 *   write:  (offset, name) -> another_file
 * </pre>
 *
 * @param addAditionalField when {@code true}, the read operation additionally emits {@code file_name}
 *                          and the write operation additionally consumes it
 * @return a mutable list holding the operations in the order parse, concat, read, write
 */
private List<Operation> generateOperations(boolean addAditionalField) {
    // Fields that are always produced by the read and always consumed by the write.
    List<String> fieldsRead = new ArrayList<>(Arrays.asList("offset", "body"));
    List<InputField> fieldsWritten = new ArrayList<>(Arrays.asList(InputField.of("read", "offset"), InputField.of("concat", "name")));
    // The optional field travels straight from the read to the write.
    if (addAditionalField) {
        fieldsRead.add("file_name");
        fieldsWritten.add(InputField.of("read", "file_name"));
    }

    ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns1", "endpoint1"), fieldsRead);

    List<InputField> parseInput = Collections.singletonList(InputField.of("read", "body"));
    TransformOperation parse = new TransformOperation("parse", "parsing body", parseInput, "first_name", "last_name");

    List<InputField> concatInput = Arrays.asList(InputField.of("parse", "first_name"), InputField.of("parse", "last_name"));
    TransformOperation concat = new TransformOperation("concat", "concatinating the fields", concatInput, "name");

    WriteOperation write = new WriteOperation("write_op", "writing data to file", EndPoint.of("myns", "another_file"), fieldsWritten);

    // Note the list order: the read operation is not listed first.
    List<Operation> result = new ArrayList<>();
    Collections.addAll(result, parse, concat, read, write);
    return result;
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation)

Example 12 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

the class FieldLineageAdminTest method testOperations.

/**
 * Checks the incoming operation details reported by {@link FieldLineageAdmin#getOperationDetails}
 * against two expected groups of operations, each paired with the programs expected to be reported for it.
 */
@Test
public void testOperations() {
    FakeFieldLineageReader fakeReader = new FakeFieldLineageReader(Collections.emptySet(), Collections.emptySet(), operations());
    FieldLineageAdmin admin = new FieldLineageAdmin(fakeReader, metadataAdmin);
    // The arguments of getOperationDetails do not matter, since the data returned is mocked.
    EndPointField queriedField = new EndPointField(EndPoint.of("ns", "file"), "somefield");
    FieldLineageDetails details = admin.getOperationDetails(Constants.FieldLineage.Direction.INCOMING, queriedField, 0, Long.MAX_VALUE);

    ProgramId program1 = new ProgramId("ns", "app", ProgramType.SPARK, "sparkprogram");
    ProgramId program2 = new ProgramId("ns", "app", ProgramType.MAPREDUCE, "mrprogram");
    ProgramRunId program1Run1 = program1.run(RunIds.generate(1000));
    ProgramRunId program1Run2 = program1.run(RunIds.generate(2000));
    ProgramRunId program1Run3 = program1.run(RunIds.generate(3000));
    ProgramRunId program2Run1 = program2.run(RunIds.generate(4000));
    ProgramRunId program1Run4 = program1.run(RunIds.generate(5000));
    ProgramRunId program2Run2 = program2.run(RunIds.generate(6000));

    EndPoint source = EndPoint.of("ns", "file");
    EndPoint sink = EndPoint.of("ns", "anotherfile");

    // First expected entry: program1Run1 and program1Run2 both generated the same set of operations, but only
    // the latest run is included in the returned list. No run of program2 generated this set of operations.
    List<ProgramInfo> firstPrograms = new ArrayList<>();
    firstPrograms.add(new ProgramInfo(program1, RunIds.getTime(program1Run2.getRun(), TimeUnit.SECONDS)));

    List<InputField> firstWriteInputs = new ArrayList<>(Arrays.asList(
        InputField.of("read", "offset"),
        InputField.of("parse", "name"),
        InputField.of("parse", "address"),
        InputField.of("parse", "zip")));

    // The returned list should contain topologically sorted operations.
    List<FieldOperationInfo> firstOperations = new ArrayList<>();
    firstOperations.add(new FieldOperationInfo("read", "reading file", FieldOperationInput.of(source), FieldOperationOutput.of(Arrays.asList("offset", "body"))));
    firstOperations.add(new FieldOperationInfo("write", "writing file", FieldOperationInput.of(firstWriteInputs), FieldOperationOutput.of(sink)));

    // Second expected entry: both program1 and program2 generated this set of operations. Only the latest run
    // of each program is returned, sorted by the last execution time.
    List<ProgramInfo> secondPrograms = new ArrayList<>();
    secondPrograms.add(new ProgramInfo(program2, RunIds.getTime(program2Run2.getRun(), TimeUnit.SECONDS)));
    secondPrograms.add(new ProgramInfo(program1, RunIds.getTime(program1Run4.getRun(), TimeUnit.SECONDS)));

    FieldOperationInput normalizeInput = FieldOperationInput.of(Collections.singletonList(InputField.of("read", "offset")));
    FieldOperationOutput normalizeOutput = FieldOperationOutput.of(Collections.singletonList("offset"));

    List<InputField> secondWriteInputs = new ArrayList<>(Arrays.asList(
        InputField.of("normalize", "offset"),
        InputField.of("parse", "name"),
        InputField.of("parse", "address"),
        InputField.of("parse", "zip")));

    List<FieldOperationInfo> secondOperations = new ArrayList<>();
    secondOperations.add(new FieldOperationInfo("read", "reading file", FieldOperationInput.of(source), FieldOperationOutput.of(Arrays.asList("offset", "body"))));
    secondOperations.add(new FieldOperationInfo("normalize", "normalizing offset", normalizeInput, normalizeOutput));
    secondOperations.add(new FieldOperationInfo("write", "writing file", FieldOperationInput.of(secondWriteInputs), FieldOperationOutput.of(sink)));

    Set<ProgramFieldOperationInfo> expected = new HashSet<>();
    expected.add(new ProgramFieldOperationInfo(firstPrograms, firstOperations));
    expected.add(new ProgramFieldOperationInfo(secondPrograms, secondOperations));

    List<ProgramFieldOperationInfo> actualIncoming = details.getIncoming();
    Assert.assertNotNull(actualIncoming);
    // Compare as sets because the ordering between different versions of operations is not guaranteed.
    Assert.assertEquals(expected, new HashSet<>(actualIncoming));
}
Also used : InputField(io.cdap.cdap.api.lineage.field.InputField) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) ProgramFieldOperationInfo(io.cdap.cdap.proto.metadata.lineage.ProgramFieldOperationInfo) ArrayList(java.util.ArrayList) FieldOperationOutput(io.cdap.cdap.proto.metadata.lineage.FieldOperationOutput) FieldOperationInput(io.cdap.cdap.proto.metadata.lineage.FieldOperationInput) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) FieldLineageDetails(io.cdap.cdap.proto.metadata.lineage.FieldLineageDetails) ProgramId(io.cdap.cdap.proto.id.ProgramId) ProgramInfo(io.cdap.cdap.proto.metadata.lineage.ProgramInfo) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) FieldOperationInfo(io.cdap.cdap.proto.metadata.lineage.FieldOperationInfo) ProgramFieldOperationInfo(io.cdap.cdap.proto.metadata.lineage.ProgramFieldOperationInfo) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 13 with InputField

use of io.cdap.cdap.api.lineage.field.InputField in project cdap by caskdata.

the class LineageOperationsProcessor method computeProcessedOperations.

/**
 * Converts every stage operation into a platform operation. Stages are visited in topological order, so that
 * a later stage always knows the origin of the fields it consumes.
 * If a stage other than a joiner has multiple inputs, implicit merge operations are generated so that further
 * stages can look up the origins.
 * For joiners, the input field name should already contain the previous stage name.
 *
 * @return a {@link Map} keyed by operation name, whose values are the corresponding platform
 * {@link Operation}s
 */
private Map<String, Operation> computeProcessedOperations() {
    Map<String, Operation> result = new HashMap<>();
    for (String stage : topologicalOrder) {
        Set<String> inputsOfStage = stageDag.getNodeInputs(stage);
        // A stage with several inputs needs merge operations, unless it is listed as not requiring a merge
        // (i.e. it is a joiner).
        boolean mergeNeeded = inputsOfStage.size() > 1 && !noMergeRequiredStages.contains(stage);
        if (mergeNeeded) {
            addMergeOperation(inputsOfStage, result);
        }

        for (FieldOperation stageOperation : stageOperations.get(stage)) {
            String platformName = prefixedName(stage, stageOperation.getName());
            // Fields produced by this operation; stays empty for a write.
            Set<String> producedFields = new LinkedHashSet<>();
            Operation platformOperation = null;

            switch (stageOperation.getType()) {
                case READ: {
                    FieldReadOperation readOp = (FieldReadOperation) stageOperation;
                    platformOperation = new ReadOperation(platformName, readOp.getDescription(), readOp.getSource(), readOp.getOutputFields());
                    producedFields.addAll(readOp.getOutputFields());
                    break;
                }
                case TRANSFORM: {
                    FieldTransformOperation transformOp = (FieldTransformOperation) stageOperation;
                    List<InputField> transformInputs = createInputFields(transformOp.getInputFields(), stage, result);
                    platformOperation = new TransformOperation(platformName, transformOp.getDescription(), transformInputs, transformOp.getOutputFields());
                    producedFields.addAll(transformOp.getOutputFields());
                    break;
                }
                case WRITE: {
                    FieldWriteOperation writeOp = (FieldWriteOperation) stageOperation;
                    List<InputField> writeInputs = createInputFields(writeOp.getInputFields(), stage, result);
                    platformOperation = new WriteOperation(platformName, writeOp.getDescription(), writeOp.getSink(), writeInputs);
                    break;
                }
            }

            // Record this operation as the origin of every field it outputs. If a later operation of the
            // same stage outputs the same field again, the origin is overwritten with that later operation.
            for (String field : producedFields) {
                stageOutputsWithOrigins.get(stage).put(field, platformOperation.getName());
            }
            result.put(platformOperation.getName(), platformOperation);
        }
    }
    return result;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation)

Aggregations

InputField (io.cdap.cdap.api.lineage.field.InputField)13 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)12 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)11 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)10 Operation (io.cdap.cdap.api.lineage.field.Operation)9 ArrayList (java.util.ArrayList)8 HashSet (java.util.HashSet)8 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)6 HashMap (java.util.HashMap)5 LinkedHashSet (java.util.LinkedHashSet)4 Set (java.util.Set)4 Test (org.junit.Test)4 Sets (com.google.common.collect.Sets)2 Charsets (com.google.common.base.Charsets)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 Gson (com.google.gson.Gson)1 GsonBuilder (com.google.gson.GsonBuilder)1 OperationType (io.cdap.cdap.api.lineage.field.OperationType)1 Checksums (io.cdap.cdap.common.utils.Checksums)1 EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField)1