
Example 1 with TransformBatchTransform

Use of org.apache.hop.beam.core.transform.TransformBatchTransform in project hop by apache.

The class BeamGenericTransformHandler, method handleTransform:

@Override
public void handleTransform(ILogChannel log, IVariables variables,
        IBeamPipelineEngineRunConfiguration runConfiguration, IHopMetadataProvider metadataProvider,
        PipelineMeta pipelineMeta, List<String> transformPluginClasses, List<String> xpPluginClasses,
        TransformMeta transformMeta, Map<String, PCollection<HopRow>> transformCollectionMap,
        Pipeline pipeline, IRowMeta rowMeta, List<TransformMeta> previousTransforms,
        PCollection<HopRow> input) throws HopException {
    // If we have no previous transform, it's an input transform: we need to start from the
    // pipeline itself.
    //
    boolean inputTransform = input == null;
    boolean reduceParallelism = checkTransformCopiesForReducedParallelism(transformMeta);
    reduceParallelism = reduceParallelism || needsSingleThreading(transformMeta);
    String transformMetaInterfaceXml = XmlHandler.openTag(TransformMeta.XML_TAG) + transformMeta.getTransform().getXml() + XmlHandler.closeTag(TransformMeta.XML_TAG);
    // See if the transform has Info transforms
    // 
    List<TransformMeta> infoTransformMetas = pipelineMeta.findPreviousTransforms(transformMeta, true);
    List<String> infoTransforms = new ArrayList<>();
    List<String> infoRowMetaJsons = new ArrayList<>();
    List<PCollectionView<List<HopRow>>> infoCollectionViews = new ArrayList<>();
    for (TransformMeta infoTransformMeta : infoTransformMetas) {
        if (!previousTransforms.contains(infoTransformMeta)) {
            infoTransforms.add(infoTransformMeta.getName());
            infoRowMetaJsons.add(JsonRowMeta.toJson(pipelineMeta.getTransformFields(variables, infoTransformMeta)));
            PCollection<HopRow> infoCollection = transformCollectionMap.get(infoTransformMeta.getName());
            if (infoCollection == null) {
                throw new HopException("Unable to find collection for transform '" + infoTransformMeta.getName() + "' providing info for '" + transformMeta.getName() + "'");
            }
            infoCollectionViews.add(infoCollection.apply(View.asList()));
        }
    }
    // Get the list of variables and their current values from the variables passed in:
    //
    List<VariableValue> variableValues = getVariableValues(variables);
    // Find out all the target transforms for this transform...
    // 
    ITransformIOMeta ioMeta = transformMeta.getTransform().getTransformIOMeta();
    List<String> targetTransforms = new ArrayList<>();
    for (IStream targetStream : ioMeta.getTargetStreams()) {
        if (targetStream.getTransformMeta() != null) {
            targetTransforms.add(targetStream.getTransformMeta().getName());
        }
    }
    // For streaming pipelines we need to flush the rows in the buffer of a generic transform (Table
    // Output, Neo4j Output, ...)
    // This is what the BeamJobConfig option "Streaming Hop Transforms Flush Interval" is for...
    // Without a valid value we default to -1 to disable flushing.
    // 
    int flushIntervalMs = Const.toInt(runConfiguration.getStreamingHopTransformsFlushInterval(), -1);
    int sizeRowsSet = Const.toInt(runConfiguration.getStreamingHopTransformsBufferSize(), 500);
    // TODO: make this configurable
    // 
    int sizeRowSet = 5000;
    // Serialize the whole metastore to JSON...
    // TODO: push this method upstairs...
    // 
    String metaStoreJson = new SerializableMetadataProvider(metadataProvider).toJson();
    // Send all this information on its way to the right nodes.
    //
    PTransform<PCollection<HopRow>, PCollectionTuple> transformTransform;
    if (needsBatching(variables, transformMeta)) {
        transformTransform = new TransformBatchTransform(
            variableValues, metaStoreJson, transformPluginClasses, xpPluginClasses,
            sizeRowSet, flushIntervalMs, transformMeta.getName(), transformMeta.getTransformPluginId(),
            transformMetaInterfaceXml, JsonRowMeta.toJson(rowMeta), inputTransform,
            targetTransforms, infoTransforms, infoRowMetaJsons, infoCollectionViews);
    } else {
        transformTransform = new TransformTransform(
            variableValues, metaStoreJson, transformPluginClasses, xpPluginClasses,
            sizeRowSet, flushIntervalMs, transformMeta.getName(), transformMeta.getTransformPluginId(),
            transformMetaInterfaceXml, JsonRowMeta.toJson(rowMeta), inputTransform,
            targetTransforms, infoTransforms, infoRowMetaJsons, infoCollectionViews);
    }
    if (input == null) {
        // Start from a dummy row and group over it.
        // Trick Beam into only running a single thread of the transform that comes next.
        // 
        input = pipeline
            .apply(Create.of(Arrays.asList("hop-single-value")))
            .setCoder(StringUtf8Coder.of())
            .apply(WithKeys.of((Void) null))
            .apply(GroupByKey.create())
            .apply(Values.create())
            .apply(Flatten.iterables())
            .apply(ParDo.of(new StringToHopRowFn(
                transformMeta.getName(), JsonRowMeta.toJson(rowMeta),
                transformPluginClasses, xpPluginClasses)));
        // Store this new collection so we can hook up other transforms...
        // 
        String tupleId = HopBeamUtil.createMainInputTupleId(transformMeta.getName());
        transformCollectionMap.put(tupleId, input);
    } else if (reduceParallelism) {
        PCollection.IsBounded isBounded = input.isBounded();
        if (isBounded == PCollection.IsBounded.BOUNDED) {
            // group across all fields to get down to a single thread...
            // 
            input = input
                .apply(WithKeys.of((Void) null))
                .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()))
                .apply(GroupByKey.create())
                .apply(Values.create())
                .apply(Flatten.iterables());
        } else {
            /*
            input = input
                .apply(Partition.of(1, new SinglePartitionFn()))
                .apply(Flatten.pCollections());
             */
            throw new HopException("Unable to reduce parallelism in an unbounded (streaming) pipeline in transform: " + transformMeta.getName());
        }
    }
    // Apply this transform's PTransform to the PCollection(s) of the previous transform(s).
    //
    PCollectionTuple tuple = input.apply(transformMeta.getName(), transformTransform);
    // The main collection
    // 
    PCollection<HopRow> mainPCollection = tuple.get(new TupleTag<>(HopBeamUtil.createMainOutputTupleId(transformMeta.getName())));
    // Save this in the map
    // 
    transformCollectionMap.put(transformMeta.getName(), mainPCollection);
    // Register the PCollections for this transform's specific target transforms as well.
    //
    for (String targetTransform : targetTransforms) {
        String tupleId = HopBeamUtil.createTargetTupleId(transformMeta.getName(), targetTransform);
        PCollection<HopRow> targetPCollection = tuple.get(new TupleTag<>(tupleId));
        // Store this in the map as well
        // 
        transformCollectionMap.put(tupleId, targetPCollection);
    }
    log.logBasic("Handled generic transform (TRANSFORM) : " + transformMeta.getName() + ", gets data from " + previousTransforms.size() + " previous transform(s), targets=" + targetTransforms.size() + ", infos=" + infoTransforms.size());
}
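
A recurring idiom in this handler is the WithKeys/GroupByKey/Values/Flatten chain, used both to feed a single trigger row into pure input transforms and to force reduced parallelism on bounded input. Because every element is keyed on the same (null) key, the GroupByKey collapses the whole collection into a single grouped value, so whatever follows effectively runs single-threaded. Below is a minimal, self-contained sketch of that idiom using placeholder string data instead of HopRow; the class name is illustrative and not part of Apache Hop, and running it needs a Beam runner such as the direct runner on the classpath.

import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VoidCoder;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.transforms.WithKeys;
import org.apache.beam.sdk.values.PCollection;

public class SingleThreadIdiomSketch {
    public static void main(String[] args) {
        Pipeline pipeline = Pipeline.create();

        PCollection<String> input =
            pipeline.apply(Create.of(Arrays.asList("a", "b", "c"))).setCoder(StringUtf8Coder.of());

        // Key every element on the same (null) key, group, then flatten the grouped
        // iterable back into individual elements: all of them now pass through one group.
        PCollection<String> singleThreaded = input
            .apply(WithKeys.of((Void) null))
            .setCoder(KvCoder.of(VoidCoder.of(), StringUtf8Coder.of()))
            .apply(GroupByKey.create())
            .apply(Values.create())
            .apply(Flatten.iterables());

        pipeline.run().waitUntilFinish();
    }
}

Note that this only works for bounded collections: on unbounded input a GroupByKey in the global window needs windowing and triggering, which is why the handler throws a HopException instead of reducing parallelism in streaming pipelines.
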
Also used : ArrayList(java.util.ArrayList) IStream(org.apache.hop.pipeline.transform.stream.IStream) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) HopRow(org.apache.hop.beam.core.HopRow) HopException(org.apache.hop.core.exception.HopException) VariableValue(org.apache.hop.beam.core.shared.VariableValue) TransformBatchTransform(org.apache.hop.beam.core.transform.TransformBatchTransform) StringToHopRowFn(org.apache.hop.beam.core.fn.StringToHopRowFn) PCollection(org.apache.beam.sdk.values.PCollection) PCollectionView(org.apache.beam.sdk.values.PCollectionView) ITransformIOMeta(org.apache.hop.pipeline.transform.ITransformIOMeta) SerializableMetadataProvider(org.apache.hop.core.metadata.SerializableMetadataProvider) TransformMeta(org.apache.hop.pipeline.transform.TransformMeta) TransformTransform(org.apache.hop.beam.core.transform.TransformTransform)
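
The PCollectionTuple handling at the end of handleTransform follows Beam's standard multi-output ParDo pattern: a DoFn declares a main TupleTag plus additional tags, and the resulting PCollectionTuple is queried with the same tags, just as the handler calls tuple.get(new TupleTag<>(tupleId)). A minimal, self-contained sketch of that pattern (class, tag and data names are illustrative and not part of Apache Hop):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class MultiOutputSketch {
    // Anonymous subclasses keep the element type information for coder inference.
    static final TupleTag<String> MAIN = new TupleTag<String>() {};
    static final TupleTag<String> REJECTED = new TupleTag<String>() {};

    public static void main(String[] args) {
        Pipeline pipeline = Pipeline.create();

        PCollectionTuple tuple = pipeline
            .apply(Create.of("ok-1", "bad-2", "ok-3"))
            .apply(ParDo.of(new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                    if (c.element().startsWith("ok")) {
                        c.output(c.element()); // main output
                    } else {
                        c.output(REJECTED, c.element()); // additional output
                    }
                }
            }).withOutputTags(MAIN, TupleTagList.of(REJECTED)));

        PCollection<String> mainRows = tuple.get(MAIN);
        PCollection<String> rejectedRows = tuple.get(REJECTED);

        pipeline.run().waitUntilFinish();
    }
}

In the handler above the tags are not shared constants; they are rebuilt from the tuple ids produced by HopBeamUtil, which works as long as the same ids are used when the outputs are declared inside TransformTransform / TransformBatchTransform.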

Aggregations

ArrayList (java.util.ArrayList): 1, PCollection (org.apache.beam.sdk.values.PCollection): 1, PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 1, PCollectionView (org.apache.beam.sdk.values.PCollectionView): 1, HopRow (org.apache.hop.beam.core.HopRow): 1, StringToHopRowFn (org.apache.hop.beam.core.fn.StringToHopRowFn): 1, VariableValue (org.apache.hop.beam.core.shared.VariableValue): 1, TransformBatchTransform (org.apache.hop.beam.core.transform.TransformBatchTransform): 1, TransformTransform (org.apache.hop.beam.core.transform.TransformTransform): 1, HopException (org.apache.hop.core.exception.HopException): 1, SerializableMetadataProvider (org.apache.hop.core.metadata.SerializableMetadataProvider): 1, ITransformIOMeta (org.apache.hop.pipeline.transform.ITransformIOMeta): 1, TransformMeta (org.apache.hop.pipeline.transform.TransformMeta): 1, IStream (org.apache.hop.pipeline.transform.stream.IStream): 1
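
Finally, the "info" transforms above are wired in as Beam side inputs: each info PCollection is materialized with View.asList() and handed to the transform function as a PCollectionView. A minimal sketch of that side-input pattern with placeholder string data (class name and data are illustrative, not Apache Hop code):

import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

public class SideInputSketch {
    public static void main(String[] args) {
        Pipeline pipeline = Pipeline.create();

        PCollection<String> mainRows = pipeline.apply("main", Create.of("row-1", "row-2"));
        PCollection<String> infoRows = pipeline.apply("info", Create.of("lookup-a", "lookup-b"));

        // Materialize the info collection as a List side input,
        // mirroring infoCollection.apply(View.asList()) in the handler.
        PCollectionView<List<String>> infoView = infoRows.apply(View.asList());

        mainRows.apply(ParDo.of(new DoFn<String, String>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
                // The complete info list is available while processing every main row.
                List<String> info = c.sideInput(infoView);
                c.output(c.element() + " (info rows: " + info.size() + ")");
            }
        }).withSideInputs(infoView));

        pipeline.run().waitUntilFinish();
    }
}

For bounded pipelines the whole list is available before the consuming DoFn processes the main rows, which is what lets the generic transform read all info rows up front.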