Search in sources :

Example 1 with HopKeyValueFn

use of org.apache.hop.beam.core.fn.HopKeyValueFn in project hop by apache.

the class BeamMergeJoinTransformHandler method handleTransform.

/**
 * Translates a Hop Merge Join transform into a Beam join.
 *
 * <p>The left and right source collections are looked up in {@code transformCollectionMap} using
 * the names of the two info-stream transforms. Each source row is split into a key row (the join
 * key fields) and a value row (all remaining fields) by a {@link HopKeyValueFn}. The two keyed
 * collections are joined with the Beam {@code Join} helper matching the configured join type, and
 * the joined result is flattened back into single rows by an {@link AssemblerFn}. The resulting
 * collection is stored in {@code transformCollectionMap} under this transform's name.
 *
 * <p>The parameters {@code runConfiguration}, {@code pipeline}, {@code rowMeta},
 * {@code previousTransforms} and {@code input} are not used by this handler.
 *
 * @param log channel used to report that the transform was handled
 * @param variables variables used when resolving the field layout of the source transforms
 * @param metadataProvider passed on to {@code loadTransformMetadata}
 * @param pipelineMeta pipeline definition used to resolve the source transforms' output fields
 * @param transformPluginClasses transform plugin class names handed to the Beam functions
 * @param xpPluginClasses extension point plugin class names handed to the Beam functions
 * @param transformMeta the Merge Join transform being handled
 * @param transformCollectionMap collections produced so far, keyed by transform name; the output
 *     of this transform is added to it
 * @throws HopException if a source transform or its collection is missing, if a join key field
 *     does not exist in its source row layout, or if the join type is not supported
 */
@Override
public void handleTransform(ILogChannel log, IVariables variables, IBeamPipelineEngineRunConfiguration runConfiguration, IHopMetadataProvider metadataProvider, PipelineMeta pipelineMeta, List<String> transformPluginClasses, List<String> xpPluginClasses, TransformMeta transformMeta, Map<String, PCollection<HopRow>> transformCollectionMap, Pipeline pipeline, IRowMeta rowMeta, List<TransformMeta> previousTransforms, PCollection<HopRow> input) throws HopException {
    // Don't simply cast but serialize/de-serialize the metadata to prevent classloader exceptions
    // 
    MergeJoinMeta meta = new MergeJoinMeta();
    loadTransformMetadata(meta, transformMeta, metadataProvider, pipelineMeta);
    String joinType = meta.getJoinType();
    String[] leftKeys = meta.getKeyFields1().toArray(new String[0]);
    String[] rightKeys = meta.getKeyFields2().toArray(new String[0]);
    // Resolve the left source: transform, collection and row layout
    // 
    TransformMeta leftInfoTransform = meta.getTransformIOMeta().getInfoStreams().get(0).getTransformMeta();
    if (leftInfoTransform == null) {
        throw new HopException("The left source transform isn't defined in the Merge Join transform called '" + transformMeta.getName() + "'");
    }
    PCollection<HopRow> leftPCollection = transformCollectionMap.get(leftInfoTransform.getName());
    if (leftPCollection == null) {
        throw new HopException("The left source collection in the pipeline couldn't be found (probably a programming error)");
    }
    IRowMeta leftRowMeta = pipelineMeta.getTransformFields(variables, leftInfoTransform);
    // Resolve the right source: transform, collection and row layout
    // 
    TransformMeta rightInfoTransform = meta.getTransformIOMeta().getInfoStreams().get(1).getTransformMeta();
    if (rightInfoTransform == null) {
        throw new HopException("The right source transform isn't defined in the Merge Join transform called '" + transformMeta.getName() + "'");
    }
    PCollection<HopRow> rightPCollection = transformCollectionMap.get(rightInfoTransform.getName());
    if (rightPCollection == null) {
        throw new HopException("The right source collection in the pipeline couldn't be found (probably a programming error)");
    }
    IRowMeta rightRowMeta = pipelineMeta.getTransformFields(variables, rightInfoTransform);
    // Create key-value pairs (KV) for the left collection
    // 
    IRowMeta leftKRowMeta = new RowMeta();
    List<String> leftV = new ArrayList<>();
    IRowMeta leftVRowMeta = new RowMeta();
    for (String leftKey : leftKeys) {
        // Fail with a descriptive error instead of a NullPointerException on an unknown key field
        leftKRowMeta.addValueMeta(findKeyValueMeta(leftRowMeta, leftKey, "left", transformMeta.getName()).clone());
    }
    for (IValueMeta valueMeta : leftRowMeta.getValueMetaList()) {
        String valueName = valueMeta.getName();
        // Every field that is not a join key belongs to the value part of the pair
        if (Const.indexOfString(valueName, leftKeys) < 0) {
            leftV.add(valueName);
            leftVRowMeta.addValueMeta(valueMeta.clone());
        }
    }
    HopKeyValueFn leftKVFn = new HopKeyValueFn(JsonRowMeta.toJson(leftRowMeta), transformPluginClasses, xpPluginClasses, leftKeys, leftV.toArray(new String[0]), transformMeta.getName());
    PCollection<KV<HopRow, HopRow>> leftKVPCollection = leftPCollection.apply(ParDo.of(leftKVFn));
    // Create key-value pairs (KV) for the right collection
    // 
    IRowMeta rightKRowMeta = new RowMeta();
    List<String> rightV = new ArrayList<>();
    IRowMeta rightVRowMeta = new RowMeta();
    for (String rightKey : rightKeys) {
        rightKRowMeta.addValueMeta(findKeyValueMeta(rightRowMeta, rightKey, "right", transformMeta.getName()).clone());
    }
    for (IValueMeta valueMeta : rightRowMeta.getValueMetaList()) {
        String valueName = valueMeta.getName();
        if (Const.indexOfString(valueName, rightKeys) < 0) {
            rightV.add(valueName);
            rightVRowMeta.addValueMeta(valueMeta.clone());
        }
    }
    HopKeyValueFn rightKVFn = new HopKeyValueFn(JsonRowMeta.toJson(rightRowMeta), transformPluginClasses, xpPluginClasses, rightKeys, rightV.toArray(new String[0]), transformMeta.getName());
    PCollection<KV<HopRow, HopRow>> rightKVPCollection = rightPCollection.apply(ParDo.of(rightKVFn));
    PCollection<KV<HopRow, KV<HopRow, HopRow>>> kvpCollection;
    // Placeholder value rows used by the outer joins for the side that has no match
    // 
    Object[] leftNull = RowDataUtil.allocateRowData(leftVRowMeta.size());
    Object[] rightNull = RowDataUtil.allocateRowData(rightVRowMeta.size());
    if (MergeJoinMeta.joinTypes[0].equals(joinType)) {
        // Inner Join
        // 
        kvpCollection = Join.innerJoin(leftKVPCollection, rightKVPCollection);
    } else if (MergeJoinMeta.joinTypes[1].equals(joinType)) {
        // Left outer join
        // 
        kvpCollection = Join.leftOuterJoin(leftKVPCollection, rightKVPCollection, new HopRow(rightNull));
    } else if (MergeJoinMeta.joinTypes[2].equals(joinType)) {
        // Right outer join
        // 
        kvpCollection = Join.rightOuterJoin(leftKVPCollection, rightKVPCollection, new HopRow(leftNull));
    } else if (MergeJoinMeta.joinTypes[3].equals(joinType)) {
        // Full outer join
        // 
        kvpCollection = Join.fullOuterJoin(leftKVPCollection, rightKVPCollection, new HopRow(leftNull), new HopRow(rightNull));
    } else {
        throw new HopException("Join type '" + joinType + "' is not recognized or supported");
    }
    // This is the output of the transform, we'll try to mimic this:
    // left values, left keys, right keys, right values
    // 
    final IRowMeta outputRowMeta = leftVRowMeta.clone();
    outputRowMeta.addRowMeta(leftKRowMeta.clone());
    outputRowMeta.addRowMeta(rightKRowMeta.clone());
    outputRowMeta.addRowMeta(rightVRowMeta.clone());
    // Now we need to collapse the results where we have a Key-Value pair of:
    // The key (left or right, depending, but with the same row metadata: leftKRowMeta == rightKRowMeta)
    // The key is simply a HopRow
    // The value:
    // The value is the resulting combination of the Value parts of the left and right side.
    // These can be null depending on the join type
    // So we want to grab all this information and put it back together on a single row.
    // 
    DoFn<KV<HopRow, KV<HopRow, HopRow>>, HopRow> assemblerFn = new AssemblerFn(JsonRowMeta.toJson(outputRowMeta), JsonRowMeta.toJson(leftKRowMeta), JsonRowMeta.toJson(leftVRowMeta), JsonRowMeta.toJson(rightVRowMeta), transformMeta.getName(), transformPluginClasses, xpPluginClasses);
    // Apply the assembler to the joined collection
    // 
    PCollection<HopRow> transformPCollection = kvpCollection.apply(ParDo.of(assemblerFn));
    // Save this in the map
    // 
    transformCollectionMap.put(transformMeta.getName(), transformPCollection);
    log.logBasic("Handled Merge Join (TRANSFORM) : " + transformMeta.getName());
}

/**
 * Looks up a join key field in a source row layout.
 *
 * @param sourceRowMeta row layout of the left or right source transform
 * @param keyField name of the join key field to look up
 * @param side either "left" or "right", only used in the error message
 * @param transformName name of the Merge Join transform, only used in the error message
 * @return the value metadata of the key field, never null
 * @throws HopException if the lookup yields no value metadata for {@code keyField}
 */
private static IValueMeta findKeyValueMeta(IRowMeta sourceRowMeta, String keyField, String side, String transformName) throws HopException {
    IValueMeta keyValueMeta = sourceRowMeta.searchValueMeta(keyField);
    if (keyValueMeta == null) {
        throw new HopException("The " + side + " key field '" + keyField + "' couldn't be found in the " + side + " source of the Merge Join transform called '" + transformName + "'");
    }
    return keyValueMeta;
}
Also used : HopKeyValueFn(org.apache.hop.beam.core.fn.HopKeyValueFn) HopException(org.apache.hop.core.exception.HopException) RowMeta(org.apache.hop.core.row.RowMeta) JsonRowMeta(org.apache.hop.beam.core.util.JsonRowMeta) IRowMeta(org.apache.hop.core.row.IRowMeta) IRowMeta(org.apache.hop.core.row.IRowMeta) ArrayList(java.util.ArrayList) MergeJoinMeta(org.apache.hop.pipeline.transforms.mergejoin.MergeJoinMeta) KV(org.apache.beam.sdk.values.KV) AssemblerFn(org.apache.hop.beam.core.fn.AssemblerFn) IValueMeta(org.apache.hop.core.row.IValueMeta) TransformMeta(org.apache.hop.pipeline.transform.TransformMeta) HopRow(org.apache.hop.beam.core.HopRow)

Example 2 with HopKeyValueFn

use of org.apache.hop.beam.core.fn.HopKeyValueFn in project hop by apache.

the class GroupByTransform method expand.

/**
 * Expands this composite transform into its Beam steps: split every input row into a
 * group-key row and a subject row, group the subject rows by key, and aggregate each group.
 *
 * <p>On the first call (while {@code inputRowMeta} is still null) Hop is initialized and the
 * group and subject row layouts are derived from the JSON-encoded input row layout.
 *
 * @param input the rows to group
 * @return one collection of aggregated rows, produced by {@link GroupByFn}
 * @throws RuntimeException wrapping any failure; the error counter is incremented and the
 *     failure is logged before rethrowing
 */
@Override
public PCollection<HopRow> expand(PCollection<HopRow> input) {
    try {
        if (inputRowMeta == null) {
            // Lazy one-time setup of the row layouts used below
            BeamHop.init(transformPluginClasses, xpPluginClasses);
            inputRowMeta = JsonRowMeta.fromJson(rowMetaJson);
            groupRowMeta = new RowMeta();
            for (String groupField : groupFields) {
                groupRowMeta.addValueMeta(inputRowMeta.searchValueMeta(groupField));
            }
            subjectRowMeta = new RowMeta();
            for (String subject : subjects) {
                subjectRowMeta.addValueMeta(inputRowMeta.searchValueMeta(subject));
            }
        }
        // Step 1: turn each HopRow into a (group fields, subject fields) pair
        // 
        HopKeyValueFn splitFn = new HopKeyValueFn(rowMetaJson, transformPluginClasses, xpPluginClasses, groupFields, subjects, transformName);
        PCollection<KV<HopRow, HopRow>> keyedRows = input.apply(ParDo.of(splitFn));
        // Step 2: collect all subject rows that share the same group row
        // 
        PCollection<KV<HopRow, Iterable<HopRow>>> rowsPerGroup = keyedRows.apply(GroupByKey.<HopRow, HopRow>create());
        // Step 3: aggregate every group.
        // Input: KV<HopRow, Iterable<HopRow>> where the key is the group row and the
        // iterable holds the subject rows of that group.
        // Output: the group values followed by the aggregation results.
        // 
        GroupByFn aggregateFn = new GroupByFn(transformName + " AGG", JsonRowMeta.toJson(groupRowMeta), transformPluginClasses, xpPluginClasses, JsonRowMeta.toJson(subjectRowMeta), aggregations);
        return rowsPerGroup.apply(ParDo.of(aggregateFn));
    } catch (Exception e) {
        numErrors.inc();
        LOG.error("Error in group by transform", e);
        throw new RuntimeException("Error in group by transform", e);
    }
}
Also used : HopKeyValueFn(org.apache.hop.beam.core.fn.HopKeyValueFn) IRowMeta(org.apache.hop.core.row.IRowMeta) RowMeta(org.apache.hop.core.row.RowMeta) JsonRowMeta(org.apache.hop.beam.core.util.JsonRowMeta) KV(org.apache.beam.sdk.values.KV) GroupByFn(org.apache.hop.beam.core.fn.GroupByFn) HopRow(org.apache.hop.beam.core.HopRow)

Aggregations

KV (org.apache.beam.sdk.values.KV)2 HopRow (org.apache.hop.beam.core.HopRow)2 HopKeyValueFn (org.apache.hop.beam.core.fn.HopKeyValueFn)2 JsonRowMeta (org.apache.hop.beam.core.util.JsonRowMeta)2 IRowMeta (org.apache.hop.core.row.IRowMeta)2 RowMeta (org.apache.hop.core.row.RowMeta)2 ArrayList (java.util.ArrayList)1 AssemblerFn (org.apache.hop.beam.core.fn.AssemblerFn)1 GroupByFn (org.apache.hop.beam.core.fn.GroupByFn)1 HopException (org.apache.hop.core.exception.HopException)1 IValueMeta (org.apache.hop.core.row.IValueMeta)1 TransformMeta (org.apache.hop.pipeline.transform.TransformMeta)1 MergeJoinMeta (org.apache.hop.pipeline.transforms.mergejoin.MergeJoinMeta)1