use of org.apache.hop.beam.core.fn.HopKeyValueFn in project hop by apache.
the class BeamMergeJoinTransformHandler method handleTransform.
/**
 * Translates a Merge Join transform into a Beam join.
 *
 * <p>Both source collections are split into (key row, value row) pairs, joined with the Beam
 * {@code Join} helper that matches the configured join type, and then re-assembled into single
 * rows by an {@code AssemblerFn}. The resulting collection is stored in
 * {@code transformCollectionMap} under the name of the transform.
 *
 * @param log channel used to report that the transform was handled
 * @param variables variables used to resolve the row layout of both source transforms
 * @param runConfiguration not used by this handler
 * @param metadataProvider metadata provider used while loading the transform metadata
 * @param pipelineMeta pipeline that contains the transform and its two sources
 * @param transformPluginClasses transform plugin class names handed to the Beam functions
 * @param xpPluginClasses extension point plugin class names handed to the Beam functions
 * @param transformMeta the Merge Join transform to handle
 * @param transformCollectionMap collections of the transforms handled so far, keyed by transform
 *     name; both sources are read from it and the join result is added to it
 * @param pipeline not used by this handler
 * @param rowMeta not used by this handler
 * @param previousTransforms not used by this handler
 * @param input not used by this handler; both inputs are looked up by source transform name
 * @throws HopException when a source transform or its collection is missing, when a join key
 *     field does not exist in its source row layout, or when the join type is not supported
 */
@Override
public void handleTransform(
    ILogChannel log,
    IVariables variables,
    IBeamPipelineEngineRunConfiguration runConfiguration,
    IHopMetadataProvider metadataProvider,
    PipelineMeta pipelineMeta,
    List<String> transformPluginClasses,
    List<String> xpPluginClasses,
    TransformMeta transformMeta,
    Map<String, PCollection<HopRow>> transformCollectionMap,
    Pipeline pipeline,
    IRowMeta rowMeta,
    List<TransformMeta> previousTransforms,
    PCollection<HopRow> input)
    throws HopException {
  // Don't simply cast but serialize/de-serialize the metadata to prevent classloader exceptions
  //
  MergeJoinMeta meta = new MergeJoinMeta();
  loadTransformMetadata(meta, transformMeta, metadataProvider, pipelineMeta);

  String joinType = meta.getJoinType();
  String[] leftKeys = meta.getKeyFields1().toArray(new String[0]);
  String[] rightKeys = meta.getKeyFields2().toArray(new String[0]);

  // The first info stream is the left source
  //
  TransformMeta leftInfoTransform =
      meta.getTransformIOMeta().getInfoStreams().get(0).getTransformMeta();
  if (leftInfoTransform == null) {
    throw new HopException("The left source transform isn't defined in the Merge Join transform called '" + transformMeta.getName() + "'");
  }
  PCollection<HopRow> leftPCollection = transformCollectionMap.get(leftInfoTransform.getName());
  if (leftPCollection == null) {
    throw new HopException("The left source collection in the pipeline couldn't be found (probably a programming error)");
  }
  IRowMeta leftRowMeta = pipelineMeta.getTransformFields(variables, leftInfoTransform);

  // The second info stream is the right source
  //
  TransformMeta rightInfoTransform =
      meta.getTransformIOMeta().getInfoStreams().get(1).getTransformMeta();
  if (rightInfoTransform == null) {
    throw new HopException("The right source transform isn't defined in the Merge Join transform called '" + transformMeta.getName() + "'");
  }
  PCollection<HopRow> rightPCollection = transformCollectionMap.get(rightInfoTransform.getName());
  if (rightPCollection == null) {
    throw new HopException("The right source collection in the pipeline couldn't be found (probably a programming error)");
  }
  IRowMeta rightRowMeta = pipelineMeta.getTransformFields(variables, rightInfoTransform);

  // Create key-value pairs (KV) for the left collections
  //
  IRowMeta leftKRowMeta = new RowMeta();
  List<String> leftV = new ArrayList<>();
  IRowMeta leftVRowMeta = new RowMeta();
  for (String leftKey : leftKeys) {
    IValueMeta keyValueMeta = leftRowMeta.searchValueMeta(leftKey);
    if (keyValueMeta == null) {
      // Report a missing key field as a configuration error instead of a NullPointerException
      throw new HopException("The left key field '" + leftKey + "' couldn't be found in the output of transform '" + leftInfoTransform.getName() + "' for the Merge Join transform called '" + transformMeta.getName() + "'");
    }
    leftKRowMeta.addValueMeta(keyValueMeta.clone());
  }
  // Every field that is not a key ends up in the value part of the pair
  for (IValueMeta valueMeta : leftRowMeta.getValueMetaList()) {
    String valueName = valueMeta.getName();
    if (Const.indexOfString(valueName, leftKeys) < 0) {
      leftV.add(valueName);
      leftVRowMeta.addValueMeta(valueMeta.clone());
    }
  }
  HopKeyValueFn leftKVFn =
      new HopKeyValueFn(
          JsonRowMeta.toJson(leftRowMeta),
          transformPluginClasses,
          xpPluginClasses,
          leftKeys,
          leftV.toArray(new String[0]),
          transformMeta.getName());
  PCollection<KV<HopRow, HopRow>> leftKVPCollection = leftPCollection.apply(ParDo.of(leftKVFn));

  // Create key-value pairs (KV) for the right collections
  //
  IRowMeta rightKRowMeta = new RowMeta();
  List<String> rightV = new ArrayList<>();
  IRowMeta rightVRowMeta = new RowMeta();
  for (String rightKey : rightKeys) {
    IValueMeta keyValueMeta = rightRowMeta.searchValueMeta(rightKey);
    if (keyValueMeta == null) {
      // Report a missing key field as a configuration error instead of a NullPointerException
      throw new HopException("The right key field '" + rightKey + "' couldn't be found in the output of transform '" + rightInfoTransform.getName() + "' for the Merge Join transform called '" + transformMeta.getName() + "'");
    }
    rightKRowMeta.addValueMeta(keyValueMeta.clone());
  }
  // Every field that is not a key ends up in the value part of the pair
  for (IValueMeta valueMeta : rightRowMeta.getValueMetaList()) {
    String valueName = valueMeta.getName();
    if (Const.indexOfString(valueName, rightKeys) < 0) {
      rightV.add(valueName);
      rightVRowMeta.addValueMeta(valueMeta.clone());
    }
  }
  HopKeyValueFn rightKVFn =
      new HopKeyValueFn(
          JsonRowMeta.toJson(rightRowMeta),
          transformPluginClasses,
          xpPluginClasses,
          rightKeys,
          rightV.toArray(new String[0]),
          transformMeta.getName());
  PCollection<KV<HopRow, HopRow>> rightKVPCollection =
      rightPCollection.apply(ParDo.of(rightKVFn));

  // Placeholder value rows handed to the outer joins for the side without a match
  //
  PCollection<KV<HopRow, KV<HopRow, HopRow>>> kvpCollection;
  Object[] leftNull = RowDataUtil.allocateRowData(leftVRowMeta.size());
  Object[] rightNull = RowDataUtil.allocateRowData(rightVRowMeta.size());

  if (MergeJoinMeta.joinTypes[0].equals(joinType)) {
    // Inner Join
    //
    kvpCollection = Join.innerJoin(leftKVPCollection, rightKVPCollection);
  } else if (MergeJoinMeta.joinTypes[1].equals(joinType)) {
    // Left outer join
    //
    kvpCollection =
        Join.leftOuterJoin(leftKVPCollection, rightKVPCollection, new HopRow(rightNull));
  } else if (MergeJoinMeta.joinTypes[2].equals(joinType)) {
    // Right outer join
    //
    kvpCollection =
        Join.rightOuterJoin(leftKVPCollection, rightKVPCollection, new HopRow(leftNull));
  } else if (MergeJoinMeta.joinTypes[3].equals(joinType)) {
    // Full outer join
    //
    kvpCollection =
        Join.fullOuterJoin(
            leftKVPCollection, rightKVPCollection, new HopRow(leftNull), new HopRow(rightNull));
  } else {
    throw new HopException("Join type '" + joinType + "' is not recognized or supported");
  }

  // This is the output of the transform, we'll try to mimic this:
  // left values, left keys, right keys, right values
  //
  final IRowMeta outputRowMeta = leftVRowMeta.clone();
  outputRowMeta.addRowMeta(leftKRowMeta.clone());
  outputRowMeta.addRowMeta(rightKRowMeta.clone());
  outputRowMeta.addRowMeta(rightVRowMeta.clone());

  // Now we need to collapse the results where we have a Key-Value pair of
  // The key (left or right depending but the same row metadata (leftKRowMeta == rightKRowMeta)
  // The key is simply a HopRow
  // The value:
  // The value is the resulting combination of the Value parts of the left and right side.
  // These can be null depending on the join type
  // So we want to grab all this information and put it back together on a single row.
  //
  DoFn<KV<HopRow, KV<HopRow, HopRow>>, HopRow> assemblerFn =
      new AssemblerFn(
          JsonRowMeta.toJson(outputRowMeta),
          JsonRowMeta.toJson(leftKRowMeta),
          JsonRowMeta.toJson(leftVRowMeta),
          JsonRowMeta.toJson(rightVRowMeta),
          transformMeta.getName(),
          transformPluginClasses,
          xpPluginClasses);

  // Apply the assembler to the joined collection
  //
  PCollection<HopRow> transformPCollection = kvpCollection.apply(ParDo.of(assemblerFn));

  // Save this in the map
  //
  transformCollectionMap.put(transformMeta.getName(), transformPCollection);
  log.logBasic("Handled Merge Join (TRANSFORM) : " + transformMeta.getName());
}
use of org.apache.hop.beam.core.fn.HopKeyValueFn in project hop by apache.
the class GroupByTransform method expand.
/**
 * Builds the group-by steps on top of the given input collection.
 *
 * <p>Each input row is split into a (group fields, subject fields) pair, the pairs are grouped
 * by key, and every group is then passed to a {@code GroupByFn} that produces the output rows.
 *
 * @param input the rows to group
 * @return the collection produced by the aggregation step
 * @throws RuntimeException wrapping any exception raised while building the steps; the error
 *     counter is incremented and the error is logged before it is thrown
 */
@Override
public PCollection<HopRow> expand(PCollection<HopRow> input) {
  try {
    // The row layouts are only derived from the JSON metadata while inputRowMeta is unset
    if (inputRowMeta == null) {
      BeamHop.init(transformPluginClasses, xpPluginClasses);
      inputRowMeta = JsonRowMeta.fromJson(rowMetaJson);

      groupRowMeta = new RowMeta();
      for (String groupField : groupFields) {
        groupRowMeta.addValueMeta(inputRowMeta.searchValueMeta(groupField));
      }

      subjectRowMeta = new RowMeta();
      for (String subject : subjects) {
        subjectRowMeta.addValueMeta(inputRowMeta.searchValueMeta(subject));
      }
    }

    // Step 1: split every HopRow into a GroupFields-HopRow key and a SubjectFields-HopRow value
    //
    HopKeyValueFn splitFn =
        new HopKeyValueFn(
            rowMetaJson,
            transformPluginClasses,
            xpPluginClasses,
            groupFields,
            subjects,
            transformName);
    PCollection<KV<HopRow, HopRow>> keyedRows = input.apply(ParDo.of(splitFn));

    // Step 2: collect all subject rows that share the same group row
    //
    PCollection<KV<HopRow, Iterable<HopRow>>> rowsPerGroup =
        keyedRows.apply(GroupByKey.<HopRow, HopRow>create());

    // Step 3: aggregate the rows in the grouped PCollection.
    // Input: KV<HopRow, Iterable<HopRow>>
    // The key is the group row; for every one of these we get the list of subject rows.
    // The aggregation of each subject list is calculated and the group values are
    // output with the result values behind them.
    //
    GroupByFn aggregateFn =
        new GroupByFn(
            transformName + " AGG",
            JsonRowMeta.toJson(groupRowMeta),
            transformPluginClasses,
            xpPluginClasses,
            JsonRowMeta.toJson(subjectRowMeta),
            aggregations);
    return rowsPerGroup.apply(ParDo.of(aggregateFn));
  } catch (Exception e) {
    numErrors.inc();
    LOG.error("Error in group by transform", e);
    throw new RuntimeException("Error in group by transform", e);
  }
}
Aggregations