Example 1 with Iterator

Use of scala.collection.Iterator in project kafka by apache.

The class InternalTopicIntegrationTest, method getTopicConfigProperties.

private Properties getTopicConfigProperties(final String changelog) {
    // Note: You must initialize the ZkClient with ZKStringSerializer.  If you don't, then
    // createTopics() will only seem to work (it will return without error).  The topic will exist
    // only in ZooKeeper and will be returned when listing topics, but Kafka itself does not create
    // the topic.
    final ZkClient zkClient = new ZkClient(CLUSTER.zKConnectString(), DEFAULT_ZK_SESSION_TIMEOUT_MS, DEFAULT_ZK_CONNECTION_TIMEOUT_MS, ZKStringSerializer$.MODULE$);
    try {
        final boolean isSecure = false;
        final ZkUtils zkUtils = new ZkUtils(zkClient, new ZkConnection(CLUSTER.zKConnectString()), isSecure);
        final Map<String, Properties> topicConfigs = AdminUtils.fetchAllTopicConfigs(zkUtils);
        final Iterator<Tuple2<String, Properties>> it = topicConfigs.iterator();
        while (it.hasNext()) {
            final Tuple2<String, Properties> topicConfig = it.next();
            final String topic = topicConfig._1;
            final Properties prop = topicConfig._2;
            if (topic.equals(changelog)) {
                return prop;
            }
        }
        return new Properties();
    } finally {
        zkClient.close();
    }
}
Also used : ZkClient(org.I0Itec.zkclient.ZkClient) Tuple2(scala.Tuple2) Iterator(scala.collection.Iterator) ZkUtils(kafka.utils.ZkUtils) Properties(java.util.Properties) ZkConnection(org.I0Itec.zkclient.ZkConnection)
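
Because AdminUtils.fetchAllTopicConfigs returns a Scala Map, iterating it from Java goes through scala.collection.Iterator and Tuple2, as the loop above shows. The same lookup as a minimal standalone sketch (TopicConfigLookup and findTopicConfig are hypothetical names, not part of Kafka):

import java.util.Properties;
import scala.Tuple2;
import scala.collection.Iterator;

final class TopicConfigLookup {

    // Hypothetical helper: walk a scala.collection.Iterator of (topic, config)
    // pairs and return the Properties for the requested topic.
    static Properties findTopicConfig(Iterator<Tuple2<String, Properties>> it, String topic) {
        while (it.hasNext()) {
            Tuple2<String, Properties> entry = it.next();
            if (entry._1().equals(topic)) {
                return entry._2();
            }
        }
        // Topic not found: fall back to an empty config, as the test above does
        return new Properties();
    }
}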

Example 2 with Iterator

Use of scala.collection.Iterator in project presto by prestodb.

The class PrestoSparkTaskExecutorFactory, method doCreate.

public <T extends PrestoSparkTaskOutput> IPrestoSparkTaskExecutor<T> doCreate(int partitionId, int attemptNumber, SerializedPrestoSparkTaskDescriptor serializedTaskDescriptor, Iterator<SerializedPrestoSparkTaskSource> serializedTaskSources, PrestoSparkTaskInputs inputs, CollectionAccumulator<SerializedTaskInfo> taskInfoCollector, CollectionAccumulator<PrestoSparkShuffleStats> shuffleStatsCollector, Class<T> outputType) {
    PrestoSparkTaskDescriptor taskDescriptor = taskDescriptorJsonCodec.fromJson(serializedTaskDescriptor.getBytes());
    ImmutableMap.Builder<String, TokenAuthenticator> extraAuthenticators = ImmutableMap.builder();
    authenticatorProviders.forEach(provider -> extraAuthenticators.putAll(provider.getTokenAuthenticators()));
    Session session = taskDescriptor.getSession().toSession(sessionPropertyManager, taskDescriptor.getExtraCredentials(), extraAuthenticators.build());
    PlanFragment fragment = taskDescriptor.getFragment();
    StageId stageId = new StageId(session.getQueryId(), fragment.getId().getId());
    // Clear the cache if it does not hold the broadcast table for the current stageId.
    // Only one hash table (HT) is cached at a time; if the stageId changes, the old
    // cached hash table is dropped.
    prestoSparkBroadcastTableCacheManager.removeCachedTablesForStagesOtherThan(stageId);
    // TODO: include attemptId in taskId
    TaskId taskId = new TaskId(new StageExecutionId(stageId, 0), partitionId);
    List<TaskSource> taskSources = getTaskSources(serializedTaskSources);
    log.info("Task [%s] received %d splits.", taskId, taskSources.stream().mapToInt(taskSource -> taskSource.getSplits().size()).sum());
    OptionalLong totalSplitSize = computeAllSplitsSize(taskSources);
    if (totalSplitSize.isPresent()) {
        log.info("Total split size: %s bytes.", totalSplitSize.getAsLong());
    }
    // TODO: Remove this once we can display the plan on Spark UI.
    log.info(PlanPrinter.textPlanFragment(fragment, functionAndTypeManager, session, true));
    DataSize maxUserMemory = new DataSize(min(nodeMemoryConfig.getMaxQueryMemoryPerNode().toBytes(), getQueryMaxMemoryPerNode(session).toBytes()), BYTE);
    DataSize maxTotalMemory = new DataSize(min(nodeMemoryConfig.getMaxQueryTotalMemoryPerNode().toBytes(), getQueryMaxTotalMemoryPerNode(session).toBytes()), BYTE);
    DataSize maxBroadcastMemory = getSparkBroadcastJoinMaxMemoryOverride(session);
    if (maxBroadcastMemory == null) {
        maxBroadcastMemory = new DataSize(min(nodeMemoryConfig.getMaxQueryBroadcastMemory().toBytes(), getQueryMaxBroadcastMemory(session).toBytes()), BYTE);
    }
    MemoryPool memoryPool = new MemoryPool(new MemoryPoolId("spark-executor-memory-pool"), maxTotalMemory);
    SpillSpaceTracker spillSpaceTracker = new SpillSpaceTracker(maxQuerySpillPerNode);
    QueryContext queryContext = new QueryContext(session.getQueryId(), maxUserMemory, maxTotalMemory, maxBroadcastMemory, maxRevocableMemory, memoryPool, new TestingGcMonitor(), notificationExecutor, yieldExecutor, maxQuerySpillPerNode, spillSpaceTracker, memoryReservationSummaryJsonCodec);
    queryContext.setVerboseExceededMemoryLimitErrorsEnabled(isVerboseExceededMemoryLimitErrorsEnabled(session));
    queryContext.setHeapDumpOnExceededMemoryLimitEnabled(isHeapDumpOnExceededMemoryLimitEnabled(session));
    String heapDumpFilePath = Paths.get(getHeapDumpFileDirectory(session), format("%s_%s.hprof", session.getQueryId().getId(), stageId.getId())).toString();
    queryContext.setHeapDumpFilePath(heapDumpFilePath);
    TaskStateMachine taskStateMachine = new TaskStateMachine(taskId, notificationExecutor);
    TaskContext taskContext = queryContext.addTaskContext(taskStateMachine, session, // Plan has to be retained only if verbose memory exceeded errors are requested
    isVerboseExceededMemoryLimitErrorsEnabled(session) ? Optional.of(fragment.getRoot()) : Optional.empty(), perOperatorCpuTimerEnabled, cpuTimerEnabled, perOperatorAllocationTrackingEnabled, allocationTrackingEnabled, false);
    final double memoryRevokingThreshold = getMemoryRevokingThreshold(session);
    final double memoryRevokingTarget = getMemoryRevokingTarget(session);
    checkArgument(memoryRevokingTarget <= memoryRevokingThreshold, "memoryRevokingTarget should be less than or equal to memoryRevokingThreshold, but got %s and %s respectively", memoryRevokingTarget, memoryRevokingThreshold);
    if (isSpillEnabled(session)) {
        memoryPool.addListener((pool, queryId, totalMemoryReservationBytes) -> {
            if (totalMemoryReservationBytes > queryContext.getPeakNodeTotalMemory()) {
                queryContext.setPeakNodeTotalMemory(totalMemoryReservationBytes);
            }
            if (totalMemoryReservationBytes > pool.getMaxBytes() * memoryRevokingThreshold && memoryRevokeRequestInProgress.compareAndSet(false, true)) {
                memoryRevocationExecutor.execute(() -> {
                    try {
                        AtomicLong remainingBytesToRevoke = new AtomicLong(totalMemoryReservationBytes - (long) (memoryRevokingTarget * pool.getMaxBytes()));
                        remainingBytesToRevoke.addAndGet(-MemoryRevokingSchedulerUtils.getMemoryAlreadyBeingRevoked(ImmutableList.of(taskContext), remainingBytesToRevoke.get()));
                        taskContext.accept(new VoidTraversingQueryContextVisitor<AtomicLong>() {

                            @Override
                            public Void visitOperatorContext(OperatorContext operatorContext, AtomicLong remainingBytesToRevoke) {
                                if (remainingBytesToRevoke.get() > 0) {
                                    long revokedBytes = operatorContext.requestMemoryRevoking();
                                    if (revokedBytes > 0) {
                                        memoryRevokePending.set(true);
                                        remainingBytesToRevoke.addAndGet(-revokedBytes);
                                    }
                                }
                                return null;
                            }
                        }, remainingBytesToRevoke);
                        memoryRevokeRequestInProgress.set(false);
                    } catch (Exception e) {
                        log.error(e, "Error requesting memory revoking");
                    }
                });
            }
            // Get the latest memory reservation info, since it might have changed due to revocation
            long totalReservedMemory = pool.getQueryMemoryReservation(queryId) + pool.getQueryRevocableMemoryReservation(queryId);
            // If total memory usage exceeds maxTotalMemory and no memory revocation request is pending, fail the query with an EXCEEDED_MEMORY_LIMIT error
            if (totalReservedMemory > maxTotalMemory.toBytes() && !memoryRevokeRequestInProgress.get() && !isMemoryRevokePending(taskContext)) {
                throw exceededLocalTotalMemoryLimit(maxTotalMemory, queryContext.getAdditionalFailureInfo(totalReservedMemory, 0) + format("Total reserved memory: %s, Total revocable memory: %s", succinctBytes(pool.getQueryMemoryReservation(queryId)), succinctBytes(pool.getQueryRevocableMemoryReservation(queryId))), isHeapDumpOnExceededMemoryLimitEnabled(session), Optional.ofNullable(heapDumpFilePath));
            }
        });
    }
    ImmutableMap.Builder<PlanNodeId, List<PrestoSparkShuffleInput>> shuffleInputs = ImmutableMap.builder();
    ImmutableMap.Builder<PlanNodeId, List<java.util.Iterator<PrestoSparkSerializedPage>>> pageInputs = ImmutableMap.builder();
    ImmutableMap.Builder<PlanNodeId, List<?>> broadcastInputs = ImmutableMap.builder();
    for (RemoteSourceNode remoteSource : fragment.getRemoteSourceNodes()) {
        List<PrestoSparkShuffleInput> remoteSourceRowInputs = new ArrayList<>();
        List<java.util.Iterator<PrestoSparkSerializedPage>> remoteSourcePageInputs = new ArrayList<>();
        List<List<?>> broadcastInputsList = new ArrayList<>();
        for (PlanFragmentId sourceFragmentId : remoteSource.getSourceFragmentIds()) {
            Iterator<Tuple2<MutablePartitionId, PrestoSparkMutableRow>> shuffleInput = inputs.getShuffleInputs().get(sourceFragmentId.toString());
            Broadcast<?> broadcastInput = inputs.getBroadcastInputs().get(sourceFragmentId.toString());
            List<PrestoSparkSerializedPage> inMemoryInput = inputs.getInMemoryInputs().get(sourceFragmentId.toString());
            if (shuffleInput != null) {
                checkArgument(broadcastInput == null, "single remote source is not expected to accept different kinds of inputs");
                checkArgument(inMemoryInput == null, "single remote source is not expected to accept different kinds of inputs");
                remoteSourceRowInputs.add(new PrestoSparkShuffleInput(sourceFragmentId.getId(), shuffleInput));
                continue;
            }
            if (broadcastInput != null) {
                checkArgument(inMemoryInput == null, "single remote source is not expected to accept different kinds of inputs");
                // TODO: Enable NullifyingIterator once migrated to one task per JVM model
                // NullifyingIterator removes element from the list upon return
                // This allows GC to gradually reclaim memory
                // remoteSourcePageInputs.add(getNullifyingIterator(broadcastInput.value()));
                broadcastInputsList.add((List<?>) broadcastInput.value());
                continue;
            }
            if (inMemoryInput != null) {
                // For in-memory inputs, pages can be released incrementally to save memory
                remoteSourcePageInputs.add(getNullifyingIterator(inMemoryInput));
                continue;
            }
            throw new IllegalArgumentException("Input not found for sourceFragmentId: " + sourceFragmentId);
        }
        if (!remoteSourceRowInputs.isEmpty()) {
            shuffleInputs.put(remoteSource.getId(), remoteSourceRowInputs);
        }
        if (!remoteSourcePageInputs.isEmpty()) {
            pageInputs.put(remoteSource.getId(), remoteSourcePageInputs);
        }
        if (!broadcastInputsList.isEmpty()) {
            broadcastInputs.put(remoteSource.getId(), broadcastInputsList);
        }
    }
    OutputBufferMemoryManager memoryManager = new OutputBufferMemoryManager(sinkMaxBufferSize.toBytes(), () -> queryContext.getTaskContextByTaskId(taskId).localSystemMemoryContext(), notificationExecutor);
    Optional<OutputPartitioning> preDeterminedPartition = Optional.empty();
    if (fragment.getPartitioningScheme().getPartitioning().getHandle().equals(FIXED_ARBITRARY_DISTRIBUTION)) {
        int partitionCount = getHashPartitionCount(session);
        preDeterminedPartition = Optional.of(new OutputPartitioning(new PreDeterminedPartitionFunction(partitionId % partitionCount, partitionCount), ImmutableList.of(), ImmutableList.of(), false, OptionalInt.empty()));
    }
    TempDataOperationContext tempDataOperationContext = new TempDataOperationContext(session.getSource(), session.getQueryId().getId(), session.getClientInfo(), Optional.of(session.getClientTags()), session.getIdentity());
    TempStorage tempStorage = tempStorageManager.getTempStorage(storageBasedBroadcastJoinStorage);
    Output<T> output = configureOutput(outputType, blockEncodingManager, memoryManager, getShuffleOutputTargetAverageRowSize(session), preDeterminedPartition, tempStorage, tempDataOperationContext, getStorageBasedBroadcastJoinWriteBufferSize(session));
    PrestoSparkOutputBuffer<?> outputBuffer = output.getOutputBuffer();
    LocalExecutionPlan localExecutionPlan = localExecutionPlanner.plan(taskContext, fragment.getRoot(), fragment.getPartitioningScheme(), fragment.getStageExecutionDescriptor(), fragment.getTableScanSchedulingOrder(), output.getOutputFactory(), new PrestoSparkRemoteSourceFactory(blockEncodingManager, shuffleInputs.build(), pageInputs.build(), broadcastInputs.build(), partitionId, shuffleStatsCollector, tempStorage, tempDataOperationContext, prestoSparkBroadcastTableCacheManager, stageId), taskDescriptor.getTableWriteInfo(), true);
    taskStateMachine.addStateChangeListener(state -> {
        if (state.isDone()) {
            outputBuffer.setNoMoreRows();
        }
    });
    PrestoSparkTaskExecution taskExecution = new PrestoSparkTaskExecution(taskStateMachine, taskContext, localExecutionPlan, taskExecutor, splitMonitor, notificationExecutor, memoryUpdateExecutor);
    taskExecution.start(taskSources);
    return new PrestoSparkTaskExecutor<>(taskContext, taskStateMachine, output.getOutputSupplier(), taskInfoCodec, taskInfoCollector, shuffleStatsCollector, executionExceptionFactory, output.getOutputBufferType(), outputBuffer, tempStorage, tempDataOperationContext);
}
Also used : StageId(com.facebook.presto.execution.StageId) ArrayList(java.util.ArrayList) PlanFragment(com.facebook.presto.sql.planner.PlanFragment) TaskStateMachine(com.facebook.presto.execution.TaskStateMachine) PlanNodeId(com.facebook.presto.spi.plan.PlanNodeId) RemoteSourceNode(com.facebook.presto.sql.planner.plan.RemoteSourceNode) DataSize(io.airlift.units.DataSize) OperatorContext(com.facebook.presto.operator.OperatorContext) OutputBufferMemoryManager(com.facebook.presto.execution.buffer.OutputBufferMemoryManager) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) TempDataOperationContext(com.facebook.presto.spi.storage.TempDataOperationContext) PrestoSparkSessionProperties.getSparkBroadcastJoinMaxMemoryOverride(com.facebook.presto.spark.PrestoSparkSessionProperties.getSparkBroadcastJoinMaxMemoryOverride) PreDeterminedPartitionFunction(com.facebook.presto.spark.execution.PrestoSparkRowOutputOperator.PreDeterminedPartitionFunction) IPrestoSparkTaskExecutor(com.facebook.presto.spark.classloader_interface.IPrestoSparkTaskExecutor) ImmutableMap(com.google.common.collect.ImmutableMap) TokenAuthenticator(com.facebook.presto.spi.security.TokenAuthenticator) SerializedPrestoSparkTaskDescriptor(com.facebook.presto.spark.classloader_interface.SerializedPrestoSparkTaskDescriptor) PrestoSparkTaskDescriptor(com.facebook.presto.spark.PrestoSparkTaskDescriptor) TaskId(com.facebook.presto.execution.TaskId) StageExecutionId(com.facebook.presto.execution.StageExecutionId) PrestoSparkUtils.getNullifyingIterator(com.facebook.presto.spark.util.PrestoSparkUtils.getNullifyingIterator) AbstractIterator(scala.collection.AbstractIterator) Iterator(scala.collection.Iterator) TestingGcMonitor(com.facebook.airlift.stats.TestingGcMonitor) PlanFragmentId(com.facebook.presto.sql.planner.plan.PlanFragmentId) MemoryPoolId(com.facebook.presto.spi.memory.MemoryPoolId) SpillSpaceTracker(com.facebook.presto.spiller.SpillSpaceTracker) TaskContext(com.facebook.presto.operator.TaskContext) QueryContext(com.facebook.presto.memory.QueryContext) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) NoSuchElementException(java.util.NoSuchElementException) PrestoSparkUtils.toPrestoSparkSerializedPage(com.facebook.presto.spark.util.PrestoSparkUtils.toPrestoSparkSerializedPage) PrestoSparkSerializedPage(com.facebook.presto.spark.classloader_interface.PrestoSparkSerializedPage) LocalExecutionPlan(com.facebook.presto.sql.planner.LocalExecutionPlanner.LocalExecutionPlan) AtomicLong(java.util.concurrent.atomic.AtomicLong) TempStorage(com.facebook.presto.spi.storage.TempStorage) Tuple2(scala.Tuple2) OptionalLong(java.util.OptionalLong) SerializedPrestoSparkTaskSource(com.facebook.presto.spark.classloader_interface.SerializedPrestoSparkTaskSource) TaskSource(com.facebook.presto.execution.TaskSource) OutputPartitioning(com.facebook.presto.sql.planner.OutputPartitioning) Session(com.facebook.presto.Session) MemoryPool(com.facebook.presto.memory.MemoryPool)
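
The factory above handles two kinds of remote-source inputs: scala.collection.Iterator for shuffle rows and java.util.Iterator for pages. When both kinds need to flow through the same downstream code, a thin adapter is enough. A minimal sketch, not part of Presto (the class name here is made up):

// Expose a scala.collection.Iterator as a java.util.Iterator. hasNext()/next()
// have identical semantics on both interfaces, so delegation is direct;
// remove() keeps its default unsupported behavior from java.util.Iterator.
final class ScalaToJavaIterator<T> implements java.util.Iterator<T> {

    private final scala.collection.Iterator<T> delegate;

    ScalaToJavaIterator(scala.collection.Iterator<T> delegate) {
        this.delegate = delegate;
    }

    @Override
    public boolean hasNext() {
        return delegate.hasNext();
    }

    @Override
    public T next() {
        return delegate.next();
    }
}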

Example 3 with Iterator

Use of scala.collection.Iterator in project incubator-systemml by apache.

The class MLContextFrameTest, method testFrame.

public void testFrame(FrameFormat format, SCRIPT_TYPE script_type, IO_TYPE inputType, IO_TYPE outputType) {
    System.out.println("MLContextTest - Frame JavaRDD<String> for format: " + format + " Script: " + script_type);
    List<String> listA = new ArrayList<String>();
    List<String> listB = new ArrayList<String>();
    FrameMetadata fmA = null, fmB = null;
    Script script = null;
    ValueType[] schemaA = { ValueType.INT, ValueType.STRING, ValueType.DOUBLE, ValueType.BOOLEAN };
    List<ValueType> lschemaA = Arrays.asList(schemaA);
    FrameSchema fschemaA = new FrameSchema(lschemaA);
    ValueType[] schemaB = { ValueType.STRING, ValueType.DOUBLE, ValueType.BOOLEAN };
    List<ValueType> lschemaB = Arrays.asList(schemaB);
    FrameSchema fschemaB = new FrameSchema(lschemaB);
    if (inputType != IO_TYPE.FILE) {
        if (format == FrameFormat.CSV) {
            listA.add("1,Str2,3.0,true");
            listA.add("4,Str5,6.0,false");
            listA.add("7,Str8,9.0,true");
            listB.add("Str12,13.0,true");
            listB.add("Str25,26.0,false");
            fmA = new FrameMetadata(FrameFormat.CSV, fschemaA, 3, 4);
            fmB = new FrameMetadata(FrameFormat.CSV, fschemaB, 2, 3);
        } else if (format == FrameFormat.IJV) {
            listA.add("1 1 1");
            listA.add("1 2 Str2");
            listA.add("1 3 3.0");
            listA.add("1 4 true");
            listA.add("2 1 4");
            listA.add("2 2 Str5");
            listA.add("2 3 6.0");
            listA.add("2 4 false");
            listA.add("3 1 7");
            listA.add("3 2 Str8");
            listA.add("3 3 9.0");
            listA.add("3 4 true");
            listB.add("1 1 Str12");
            listB.add("1 2 13.0");
            listB.add("1 3 true");
            listB.add("2 1 Str25");
            listB.add("2 2 26.0");
            listB.add("2 3 false");
            fmA = new FrameMetadata(FrameFormat.IJV, fschemaA, 3, 4);
            fmB = new FrameMetadata(FrameFormat.IJV, fschemaB, 2, 3);
        }
        JavaRDD<String> javaRDDA = sc.parallelize(listA);
        JavaRDD<String> javaRDDB = sc.parallelize(listB);
        if (inputType == IO_TYPE.DATAFRAME) {
            JavaRDD<Row> javaRddRowA = FrameRDDConverterUtils.csvToRowRDD(sc, javaRDDA, CSV_DELIM, schemaA);
            JavaRDD<Row> javaRddRowB = FrameRDDConverterUtils.csvToRowRDD(sc, javaRDDB, CSV_DELIM, schemaB);
            // Create DataFrame
            StructType dfSchemaA = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaA, false);
            Dataset<Row> dataFrameA = spark.createDataFrame(javaRddRowA, dfSchemaA);
            StructType dfSchemaB = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaB, false);
            Dataset<Row> dataFrameB = spark.createDataFrame(javaRddRowB, dfSchemaB);
            if (script_type == SCRIPT_TYPE.DML)
                script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", dataFrameA, fmA).in("B", dataFrameB, fmB).out("A").out("C");
            else if (script_type == SCRIPT_TYPE.PYDML)
                // DO NOT USE ';' at the end of any statement; it throws an NPE
                script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", dataFrameA, fmA).in("B", dataFrameB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
        } else {
            if (inputType == IO_TYPE.JAVA_RDD_STR_CSV || inputType == IO_TYPE.JAVA_RDD_STR_IJV) {
                if (script_type == SCRIPT_TYPE.DML)
                    script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", javaRDDA, fmA).in("B", javaRDDB, fmB).out("A").out("C");
                else if (script_type == SCRIPT_TYPE.PYDML)
                    // DO NOT USE ';' at the end of any statement; it throws an NPE
                    script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", javaRDDA, fmA).in("B", javaRDDB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
            } else if (inputType == IO_TYPE.RDD_STR_CSV || inputType == IO_TYPE.RDD_STR_IJV) {
                RDD<String> rddA = JavaRDD.toRDD(javaRDDA);
                RDD<String> rddB = JavaRDD.toRDD(javaRDDB);
                if (script_type == SCRIPT_TYPE.DML)
                    script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", rddA, fmA).in("B", rddB, fmB).out("A").out("C");
                else if (script_type == SCRIPT_TYPE.PYDML)
                    // DO NOT USE ';' at the end of any statement; it throws an NPE
                    script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", rddA, fmA).in("B", rddB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
            }
        }
    } else {
        // Input type is file
        String fileA = null, fileB = null;
        if (format == FrameFormat.CSV) {
            fileA = baseDirectory + File.separator + "FrameA.csv";
            fileB = baseDirectory + File.separator + "FrameB.csv";
        } else if (format == FrameFormat.IJV) {
            fileA = baseDirectory + File.separator + "FrameA.ijv";
            fileB = baseDirectory + File.separator + "FrameB.ijv";
        }
        if (script_type == SCRIPT_TYPE.DML)
            script = dml("A=read($A); B=read($B);A[2:3,2:4]=B;C=A[2:3,2:3];A[1,1]=234").in("$A", fileA, fmA).in("$B", fileB, fmB).out("A").out("C");
        else if (script_type == SCRIPT_TYPE.PYDML)
            // DO NOT USE ';' at the end of any statement; it throws an NPE
            script = pydml("A=load($A)\nB=load($B)\nA[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("$A", fileA).in("$B", fileB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
    }
    MLResults mlResults = ml.execute(script);
    // Validate output schema
    List<ValueType> lschemaOutA = Arrays.asList(mlResults.getFrameObject("A").getSchema());
    List<ValueType> lschemaOutC = Arrays.asList(mlResults.getFrameObject("C").getSchema());
    Assert.assertEquals(ValueType.INT, lschemaOutA.get(0));
    Assert.assertEquals(ValueType.STRING, lschemaOutA.get(1));
    Assert.assertEquals(ValueType.DOUBLE, lschemaOutA.get(2));
    Assert.assertEquals(ValueType.BOOLEAN, lschemaOutA.get(3));
    Assert.assertEquals(ValueType.STRING, lschemaOutC.get(0));
    Assert.assertEquals(ValueType.DOUBLE, lschemaOutC.get(1));
    if (outputType == IO_TYPE.JAVA_RDD_STR_CSV) {
        JavaRDD<String> javaRDDStringCSVA = mlResults.getJavaRDDStringCSV("A");
        List<String> linesA = javaRDDStringCSVA.collect();
        Assert.assertEquals("1,Str2,3.0,true", linesA.get(0));
        Assert.assertEquals("4,Str12,13.0,true", linesA.get(1));
        Assert.assertEquals("7,Str25,26.0,false", linesA.get(2));
        JavaRDD<String> javaRDDStringCSVC = mlResults.getJavaRDDStringCSV("C");
        List<String> linesC = javaRDDStringCSVC.collect();
        Assert.assertEquals("Str12,13.0", linesC.get(0));
        Assert.assertEquals("Str25,26.0", linesC.get(1));
    } else if (outputType == IO_TYPE.JAVA_RDD_STR_IJV) {
        JavaRDD<String> javaRDDStringIJVA = mlResults.getJavaRDDStringIJV("A");
        List<String> linesA = javaRDDStringIJVA.collect();
        Assert.assertEquals("1 1 1", linesA.get(0));
        Assert.assertEquals("1 2 Str2", linesA.get(1));
        Assert.assertEquals("1 3 3.0", linesA.get(2));
        Assert.assertEquals("1 4 true", linesA.get(3));
        Assert.assertEquals("2 1 4", linesA.get(4));
        Assert.assertEquals("2 2 Str12", linesA.get(5));
        Assert.assertEquals("2 3 13.0", linesA.get(6));
        Assert.assertEquals("2 4 true", linesA.get(7));
        JavaRDD<String> javaRDDStringIJVC = mlResults.getJavaRDDStringIJV("C");
        List<String> linesC = javaRDDStringIJVC.collect();
        Assert.assertEquals("1 1 Str12", linesC.get(0));
        Assert.assertEquals("1 2 13.0", linesC.get(1));
        Assert.assertEquals("2 1 Str25", linesC.get(2));
        Assert.assertEquals("2 2 26.0", linesC.get(3));
    } else if (outputType == IO_TYPE.RDD_STR_CSV) {
        RDD<String> rddStringCSVA = mlResults.getRDDStringCSV("A");
        Iterator<String> iteratorA = rddStringCSVA.toLocalIterator();
        Assert.assertEquals("1,Str2,3.0,true", iteratorA.next());
        Assert.assertEquals("4,Str12,13.0,true", iteratorA.next());
        Assert.assertEquals("7,Str25,26.0,false", iteratorA.next());
        RDD<String> rddStringCSVC = mlResults.getRDDStringCSV("C");
        Iterator<String> iteratorC = rddStringCSVC.toLocalIterator();
        Assert.assertEquals("Str12,13.0", iteratorC.next());
        Assert.assertEquals("Str25,26.0", iteratorC.next());
    } else if (outputType == IO_TYPE.RDD_STR_IJV) {
        RDD<String> rddStringIJVA = mlResults.getRDDStringIJV("A");
        Iterator<String> iteratorA = rddStringIJVA.toLocalIterator();
        Assert.assertEquals("1 1 1", iteratorA.next());
        Assert.assertEquals("1 2 Str2", iteratorA.next());
        Assert.assertEquals("1 3 3.0", iteratorA.next());
        Assert.assertEquals("1 4 true", iteratorA.next());
        Assert.assertEquals("2 1 4", iteratorA.next());
        Assert.assertEquals("2 2 Str12", iteratorA.next());
        Assert.assertEquals("2 3 13.0", iteratorA.next());
        Assert.assertEquals("2 4 true", iteratorA.next());
        Assert.assertEquals("3 1 7", iteratorA.next());
        Assert.assertEquals("3 2 Str25", iteratorA.next());
        Assert.assertEquals("3 3 26.0", iteratorA.next());
        Assert.assertEquals("3 4 false", iteratorA.next());
        RDD<String> rddStringIJVC = mlResults.getRDDStringIJV("C");
        Iterator<String> iteratorC = rddStringIJVC.toLocalIterator();
        Assert.assertEquals("1 1 Str12", iteratorC.next());
        Assert.assertEquals("1 2 13.0", iteratorC.next());
        Assert.assertEquals("2 1 Str25", iteratorC.next());
        Assert.assertEquals("2 2 26.0", iteratorC.next());
    } else if (outputType == IO_TYPE.DATAFRAME) {
        Dataset<Row> dataFrameA = mlResults.getDataFrame("A").drop(RDDConverterUtils.DF_ID_COLUMN);
        StructType dfschemaA = dataFrameA.schema();
        StructField structTypeA = dfschemaA.apply(0);
        Assert.assertEquals(DataTypes.LongType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(1);
        Assert.assertEquals(DataTypes.StringType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(2);
        Assert.assertEquals(DataTypes.DoubleType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(3);
        Assert.assertEquals(DataTypes.BooleanType, structTypeA.dataType());
        List<Row> listAOut = dataFrameA.collectAsList();
        Row row1 = listAOut.get(0);
        Assert.assertEquals("Mismatch with expected value", Long.valueOf(1), row1.get(0));
        Assert.assertEquals("Mismatch with expected value", "Str2", row1.get(1));
        Assert.assertEquals("Mismatch with expected value", 3.0, row1.get(2));
        Assert.assertEquals("Mismatch with expected value", true, row1.get(3));
        Row row2 = listAOut.get(1);
        Assert.assertEquals("Mismatch with expected value", Long.valueOf(4), row2.get(0));
        Assert.assertEquals("Mismatch with expected value", "Str12", row2.get(1));
        Assert.assertEquals("Mismatch with expected value", 13.0, row2.get(2));
        Assert.assertEquals("Mismatch with expected value", true, row2.get(3));
        Dataset<Row> dataFrameC = mlResults.getDataFrame("C").drop(RDDConverterUtils.DF_ID_COLUMN);
        StructType dfschemaC = dataFrameC.schema();
        StructField structTypeC = dfschemaC.apply(0);
        Assert.assertEquals(DataTypes.StringType, structTypeC.dataType());
        structTypeC = dfschemaC.apply(1);
        Assert.assertEquals(DataTypes.DoubleType, structTypeC.dataType());
        List<Row> listCOut = dataFrameC.collectAsList();
        Row row3 = listCOut.get(0);
        Assert.assertEquals("Mismatch with expected value", "Str12", row3.get(0));
        Assert.assertEquals("Mismatch with expected value", 13.0, row3.get(1));
        Row row4 = listCOut.get(1);
        Assert.assertEquals("Mismatch with expected value", "Str25", row4.get(0));
        Assert.assertEquals("Mismatch with expected value", 26.0, row4.get(1));
    } else {
        String[][] frameA = mlResults.getFrameAs2DStringArray("A");
        Assert.assertEquals("Str2", frameA[0][1]);
        Assert.assertEquals("3.0", frameA[0][2]);
        Assert.assertEquals("13.0", frameA[1][2]);
        Assert.assertEquals("true", frameA[1][3]);
        Assert.assertEquals("Str25", frameA[2][1]);
        String[][] frameC = mlResults.getFrameAs2DStringArray("C");
        Assert.assertEquals("Str12", frameC[0][0]);
        Assert.assertEquals("Str25", frameC[1][0]);
        Assert.assertEquals("13.0", frameC[0][1]);
        Assert.assertEquals("26.0", frameC[1][1]);
    }
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) FrameSchema(org.apache.sysml.api.mlcontext.FrameSchema) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaRDD(org.apache.spark.api.java.JavaRDD) RDD(org.apache.spark.rdd.RDD) StructField(org.apache.spark.sql.types.StructField) Iterator(scala.collection.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) Row(org.apache.spark.sql.Row) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata)
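
Note that in the RDD_STR_CSV and RDD_STR_IJV branches, RDD.toLocalIterator() on the Scala RDD API returns a scala.collection.Iterator rather than a java.util.Iterator, which is why the assertions call hasNext()/next() on the Scala interface. A minimal sketch of draining such an iterator into a Java list (drain is a hypothetical helper name):

// Drain RDD#toLocalIterator (a scala.collection.Iterator) into a Java list.
// Unlike collect(), toLocalIterator() fetches one partition at a time, so
// driver memory is bounded by the largest partition, not the whole RDD.
static java.util.List<String> drain(org.apache.spark.rdd.RDD<String> rdd) {
    java.util.List<String> lines = new java.util.ArrayList<>();
    scala.collection.Iterator<String> it = rdd.toLocalIterator();
    while (it.hasNext()) {
        lines.add(it.next());
    }
    return lines;
}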

Example 4 with Iterator

Use of scala.collection.Iterator in project systemml by apache.

The class MLContextFrameTest, method testFrame.

public void testFrame(FrameFormat format, SCRIPT_TYPE script_type, IO_TYPE inputType, IO_TYPE outputType) {
    System.out.println("MLContextTest - Frame JavaRDD<String> for format: " + format + " Script: " + script_type);
    List<String> listA = new ArrayList<String>();
    List<String> listB = new ArrayList<String>();
    FrameMetadata fmA = null, fmB = null;
    Script script = null;
    ValueType[] schemaA = { ValueType.INT, ValueType.STRING, ValueType.DOUBLE, ValueType.BOOLEAN };
    List<ValueType> lschemaA = Arrays.asList(schemaA);
    FrameSchema fschemaA = new FrameSchema(lschemaA);
    ValueType[] schemaB = { ValueType.STRING, ValueType.DOUBLE, ValueType.BOOLEAN };
    List<ValueType> lschemaB = Arrays.asList(schemaB);
    FrameSchema fschemaB = new FrameSchema(lschemaB);
    if (inputType != IO_TYPE.FILE) {
        if (format == FrameFormat.CSV) {
            listA.add("1,Str2,3.0,true");
            listA.add("4,Str5,6.0,false");
            listA.add("7,Str8,9.0,true");
            listB.add("Str12,13.0,true");
            listB.add("Str25,26.0,false");
            fmA = new FrameMetadata(FrameFormat.CSV, fschemaA, 3, 4);
            fmB = new FrameMetadata(FrameFormat.CSV, fschemaB, 2, 3);
        } else if (format == FrameFormat.IJV) {
            listA.add("1 1 1");
            listA.add("1 2 Str2");
            listA.add("1 3 3.0");
            listA.add("1 4 true");
            listA.add("2 1 4");
            listA.add("2 2 Str5");
            listA.add("2 3 6.0");
            listA.add("2 4 false");
            listA.add("3 1 7");
            listA.add("3 2 Str8");
            listA.add("3 3 9.0");
            listA.add("3 4 true");
            listB.add("1 1 Str12");
            listB.add("1 2 13.0");
            listB.add("1 3 true");
            listB.add("2 1 Str25");
            listB.add("2 2 26.0");
            listB.add("2 3 false");
            fmA = new FrameMetadata(FrameFormat.IJV, fschemaA, 3, 4);
            fmB = new FrameMetadata(FrameFormat.IJV, fschemaB, 2, 3);
        }
        JavaRDD<String> javaRDDA = sc.parallelize(listA);
        JavaRDD<String> javaRDDB = sc.parallelize(listB);
        if (inputType == IO_TYPE.DATAFRAME) {
            JavaRDD<Row> javaRddRowA = FrameRDDConverterUtils.csvToRowRDD(sc, javaRDDA, CSV_DELIM, schemaA);
            JavaRDD<Row> javaRddRowB = FrameRDDConverterUtils.csvToRowRDD(sc, javaRDDB, CSV_DELIM, schemaB);
            // Create DataFrame
            StructType dfSchemaA = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaA, false);
            Dataset<Row> dataFrameA = spark.createDataFrame(javaRddRowA, dfSchemaA);
            StructType dfSchemaB = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaB, false);
            Dataset<Row> dataFrameB = spark.createDataFrame(javaRddRowB, dfSchemaB);
            if (script_type == SCRIPT_TYPE.DML)
                script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", dataFrameA, fmA).in("B", dataFrameB, fmB).out("A").out("C");
            else if (script_type == SCRIPT_TYPE.PYDML)
                // DO NOT USE ';' at the end of any statement; it throws an NPE
                script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", dataFrameA, fmA).in("B", dataFrameB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
        } else {
            if (inputType == IO_TYPE.JAVA_RDD_STR_CSV || inputType == IO_TYPE.JAVA_RDD_STR_IJV) {
                if (script_type == SCRIPT_TYPE.DML)
                    script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", javaRDDA, fmA).in("B", javaRDDB, fmB).out("A").out("C");
                else if (script_type == SCRIPT_TYPE.PYDML)
                    // DO NOT USE ';' at the end of any statement; it throws an NPE
                    script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", javaRDDA, fmA).in("B", javaRDDB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
            } else if (inputType == IO_TYPE.RDD_STR_CSV || inputType == IO_TYPE.RDD_STR_IJV) {
                RDD<String> rddA = JavaRDD.toRDD(javaRDDA);
                RDD<String> rddB = JavaRDD.toRDD(javaRDDB);
                if (script_type == SCRIPT_TYPE.DML)
                    script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", rddA, fmA).in("B", rddB, fmB).out("A").out("C");
                else if (script_type == SCRIPT_TYPE.PYDML)
                    // DO NOT USE ';' at the end of any statement; it throws an NPE
                    script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", rddA, fmA).in("B", rddB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
            }
        }
    } else {
        // Input type is file
        String fileA = null, fileB = null;
        if (format == FrameFormat.CSV) {
            fileA = baseDirectory + File.separator + "FrameA.csv";
            fileB = baseDirectory + File.separator + "FrameB.csv";
        } else if (format == FrameFormat.IJV) {
            fileA = baseDirectory + File.separator + "FrameA.ijv";
            fileB = baseDirectory + File.separator + "FrameB.ijv";
        }
        if (script_type == SCRIPT_TYPE.DML)
            script = dml("A=read($A); B=read($B);A[2:3,2:4]=B;C=A[2:3,2:3];A[1,1]=234").in("$A", fileA, fmA).in("$B", fileB, fmB).out("A").out("C");
        else if (script_type == SCRIPT_TYPE.PYDML)
            // DO NOT USE ';' at the end of any statement; it throws an NPE
            script = pydml("A=load($A)\nB=load($B)\nA[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("$A", fileA).in("$B", fileB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
    }
    MLResults mlResults = ml.execute(script);
    // Validate output schema
    List<ValueType> lschemaOutA = Arrays.asList(mlResults.getFrameObject("A").getSchema());
    List<ValueType> lschemaOutC = Arrays.asList(mlResults.getFrameObject("C").getSchema());
    Assert.assertEquals(ValueType.INT, lschemaOutA.get(0));
    Assert.assertEquals(ValueType.STRING, lschemaOutA.get(1));
    Assert.assertEquals(ValueType.DOUBLE, lschemaOutA.get(2));
    Assert.assertEquals(ValueType.BOOLEAN, lschemaOutA.get(3));
    Assert.assertEquals(ValueType.STRING, lschemaOutC.get(0));
    Assert.assertEquals(ValueType.DOUBLE, lschemaOutC.get(1));
    if (outputType == IO_TYPE.JAVA_RDD_STR_CSV) {
        JavaRDD<String> javaRDDStringCSVA = mlResults.getJavaRDDStringCSV("A");
        List<String> linesA = javaRDDStringCSVA.collect();
        Assert.assertEquals("1,Str2,3.0,true", linesA.get(0));
        Assert.assertEquals("4,Str12,13.0,true", linesA.get(1));
        Assert.assertEquals("7,Str25,26.0,false", linesA.get(2));
        JavaRDD<String> javaRDDStringCSVC = mlResults.getJavaRDDStringCSV("C");
        List<String> linesC = javaRDDStringCSVC.collect();
        Assert.assertEquals("Str12,13.0", linesC.get(0));
        Assert.assertEquals("Str25,26.0", linesC.get(1));
    } else if (outputType == IO_TYPE.JAVA_RDD_STR_IJV) {
        JavaRDD<String> javaRDDStringIJVA = mlResults.getJavaRDDStringIJV("A");
        List<String> linesA = javaRDDStringIJVA.collect();
        Assert.assertEquals("1 1 1", linesA.get(0));
        Assert.assertEquals("1 2 Str2", linesA.get(1));
        Assert.assertEquals("1 3 3.0", linesA.get(2));
        Assert.assertEquals("1 4 true", linesA.get(3));
        Assert.assertEquals("2 1 4", linesA.get(4));
        Assert.assertEquals("2 2 Str12", linesA.get(5));
        Assert.assertEquals("2 3 13.0", linesA.get(6));
        Assert.assertEquals("2 4 true", linesA.get(7));
        JavaRDD<String> javaRDDStringIJVC = mlResults.getJavaRDDStringIJV("C");
        List<String> linesC = javaRDDStringIJVC.collect();
        Assert.assertEquals("1 1 Str12", linesC.get(0));
        Assert.assertEquals("1 2 13.0", linesC.get(1));
        Assert.assertEquals("2 1 Str25", linesC.get(2));
        Assert.assertEquals("2 2 26.0", linesC.get(3));
    } else if (outputType == IO_TYPE.RDD_STR_CSV) {
        RDD<String> rddStringCSVA = mlResults.getRDDStringCSV("A");
        Iterator<String> iteratorA = rddStringCSVA.toLocalIterator();
        Assert.assertEquals("1,Str2,3.0,true", iteratorA.next());
        Assert.assertEquals("4,Str12,13.0,true", iteratorA.next());
        Assert.assertEquals("7,Str25,26.0,false", iteratorA.next());
        RDD<String> rddStringCSVC = mlResults.getRDDStringCSV("C");
        Iterator<String> iteratorC = rddStringCSVC.toLocalIterator();
        Assert.assertEquals("Str12,13.0", iteratorC.next());
        Assert.assertEquals("Str25,26.0", iteratorC.next());
    } else if (outputType == IO_TYPE.RDD_STR_IJV) {
        RDD<String> rddStringIJVA = mlResults.getRDDStringIJV("A");
        Iterator<String> iteratorA = rddStringIJVA.toLocalIterator();
        Assert.assertEquals("1 1 1", iteratorA.next());
        Assert.assertEquals("1 2 Str2", iteratorA.next());
        Assert.assertEquals("1 3 3.0", iteratorA.next());
        Assert.assertEquals("1 4 true", iteratorA.next());
        Assert.assertEquals("2 1 4", iteratorA.next());
        Assert.assertEquals("2 2 Str12", iteratorA.next());
        Assert.assertEquals("2 3 13.0", iteratorA.next());
        Assert.assertEquals("2 4 true", iteratorA.next());
        Assert.assertEquals("3 1 7", iteratorA.next());
        Assert.assertEquals("3 2 Str25", iteratorA.next());
        Assert.assertEquals("3 3 26.0", iteratorA.next());
        Assert.assertEquals("3 4 false", iteratorA.next());
        RDD<String> rddStringIJVC = mlResults.getRDDStringIJV("C");
        Iterator<String> iteratorC = rddStringIJVC.toLocalIterator();
        Assert.assertEquals("1 1 Str12", iteratorC.next());
        Assert.assertEquals("1 2 13.0", iteratorC.next());
        Assert.assertEquals("2 1 Str25", iteratorC.next());
        Assert.assertEquals("2 2 26.0", iteratorC.next());
    } else if (outputType == IO_TYPE.DATAFRAME) {
        Dataset<Row> dataFrameA = mlResults.getDataFrame("A").drop(RDDConverterUtils.DF_ID_COLUMN);
        StructType dfschemaA = dataFrameA.schema();
        StructField structTypeA = dfschemaA.apply(0);
        Assert.assertEquals(DataTypes.LongType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(1);
        Assert.assertEquals(DataTypes.StringType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(2);
        Assert.assertEquals(DataTypes.DoubleType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(3);
        Assert.assertEquals(DataTypes.BooleanType, structTypeA.dataType());
        List<Row> listAOut = dataFrameA.collectAsList();
        Row row1 = listAOut.get(0);
        Assert.assertEquals("Mismatch with expected value", Long.valueOf(1), row1.get(0));
        Assert.assertEquals("Mismatch with expected value", "Str2", row1.get(1));
        Assert.assertEquals("Mismatch with expected value", 3.0, row1.get(2));
        Assert.assertEquals("Mismatch with expected value", true, row1.get(3));
        Row row2 = listAOut.get(1);
        Assert.assertEquals("Mismatch with expected value", Long.valueOf(4), row2.get(0));
        Assert.assertEquals("Mismatch with expected value", "Str12", row2.get(1));
        Assert.assertEquals("Mismatch with expected value", 13.0, row2.get(2));
        Assert.assertEquals("Mismatch with expected value", true, row2.get(3));
        Dataset<Row> dataFrameC = mlResults.getDataFrame("C").drop(RDDConverterUtils.DF_ID_COLUMN);
        StructType dfschemaC = dataFrameC.schema();
        StructField structTypeC = dfschemaC.apply(0);
        Assert.assertEquals(DataTypes.StringType, structTypeC.dataType());
        structTypeC = dfschemaC.apply(1);
        Assert.assertEquals(DataTypes.DoubleType, structTypeC.dataType());
        List<Row> listCOut = dataFrameC.collectAsList();
        Row row3 = listCOut.get(0);
        Assert.assertEquals("Mismatch with expected value", "Str12", row3.get(0));
        Assert.assertEquals("Mismatch with expected value", 13.0, row3.get(1));
        Row row4 = listCOut.get(1);
        Assert.assertEquals("Mismatch with expected value", "Str25", row4.get(0));
        Assert.assertEquals("Mismatch with expected value", 26.0, row4.get(1));
    } else {
        String[][] frameA = mlResults.getFrameAs2DStringArray("A");
        Assert.assertEquals("Str2", frameA[0][1]);
        Assert.assertEquals("3.0", frameA[0][2]);
        Assert.assertEquals("13.0", frameA[1][2]);
        Assert.assertEquals("true", frameA[1][3]);
        Assert.assertEquals("Str25", frameA[2][1]);
        String[][] frameC = mlResults.getFrameAs2DStringArray("C");
        Assert.assertEquals("Str12", frameC[0][0]);
        Assert.assertEquals("Str25", frameC[1][0]);
        Assert.assertEquals("13.0", frameC[0][1]);
        Assert.assertEquals("26.0", frameC[1][1]);
    }
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) FrameSchema(org.apache.sysml.api.mlcontext.FrameSchema) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaRDD(org.apache.spark.api.java.JavaRDD) RDD(org.apache.spark.rdd.RDD) StructField(org.apache.spark.sql.types.StructField) Iterator(scala.collection.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) Row(org.apache.spark.sql.Row) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata)

Example 5 with Iterator

Use of scala.collection.Iterator in project flink by apache.

The class PythonCorrelateSplitRule, method createNewFieldNames.

private List<String> createNewFieldNames(RelDataType rowType, RexBuilder rexBuilder, int primitiveFieldCount, ArrayBuffer<RexNode> extractedRexNodes, List<RexNode> calcProjects) {
    for (int i = 0; i < primitiveFieldCount; i++) {
        calcProjects.add(RexInputRef.of(i, rowType));
    }
    // change RexCorrelVariable to RexInputRef.
    RexDefaultVisitor<RexNode> visitor = new RexDefaultVisitor<RexNode>() {

        @Override
        public RexNode visitFieldAccess(RexFieldAccess fieldAccess) {
            RexNode expr = fieldAccess.getReferenceExpr();
            if (expr instanceof RexCorrelVariable) {
                RelDataTypeField field = fieldAccess.getField();
                return new RexInputRef(field.getIndex(), field.getType());
            } else {
                return rexBuilder.makeFieldAccess(expr.accept(this), fieldAccess.getField().getIndex());
            }
        }

        @Override
        public RexNode visitNode(RexNode rexNode) {
            return rexNode;
        }
    };
    // add the fields of the extracted rex calls.
    Iterator<RexNode> iterator = extractedRexNodes.iterator();
    while (iterator.hasNext()) {
        RexNode rexNode = iterator.next();
        if (rexNode instanceof RexCall) {
            RexCall rexCall = (RexCall) rexNode;
            List<RexNode> newProjects = rexCall.getOperands().stream().map(x -> x.accept(visitor)).collect(Collectors.toList());
            RexCall newRexCall = rexCall.clone(rexCall.getType(), newProjects);
            calcProjects.add(newRexCall);
        } else {
            calcProjects.add(rexNode);
        }
    }
    List<String> nameList = new LinkedList<>();
    for (int i = 0; i < primitiveFieldCount; i++) {
        nameList.add(rowType.getFieldNames().get(i));
    }
    Iterator<Object> indicesIterator = extractedRexNodes.indices().iterator();
    while (indicesIterator.hasNext()) {
        nameList.add("f" + indicesIterator.next());
    }
    return SqlValidatorUtil.uniquify(nameList, rexBuilder.getTypeFactory().getTypeSystem().isSchemaCaseSensitive());
}
Also used : RexFieldAccess(org.apache.calcite.rex.RexFieldAccess) RexProgram(org.apache.calcite.rex.RexProgram) RexUtil(org.apache.calcite.rex.RexUtil) SqlValidatorUtil(org.apache.calcite.sql.validate.SqlValidatorUtil) RexNode(org.apache.calcite.rex.RexNode) LinkedList(java.util.LinkedList) ArrayBuffer(scala.collection.mutable.ArrayBuffer) PythonUtil(org.apache.flink.table.planner.plan.utils.PythonUtil) RelDataType(org.apache.calcite.rel.type.RelDataType) RexDefaultVisitor(org.apache.flink.table.planner.plan.utils.RexDefaultVisitor) RexBuilder(org.apache.calcite.rex.RexBuilder) Iterator(scala.collection.Iterator) FlinkLogicalTableFunctionScan(org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalTableFunctionScan) FlinkLogicalCalc(org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalCalc) RelNode(org.apache.calcite.rel.RelNode) Collectors(java.util.stream.Collectors) RelOptRuleCall(org.apache.calcite.plan.RelOptRuleCall) RexInputRef(org.apache.calcite.rex.RexInputRef) RelOptRule(org.apache.calcite.plan.RelOptRule) RexProgramBuilder(org.apache.calcite.rex.RexProgramBuilder) List(java.util.List) StreamPhysicalCorrelateRule(org.apache.flink.table.planner.plan.rules.physical.stream.StreamPhysicalCorrelateRule) RelDataTypeField(org.apache.calcite.rel.type.RelDataTypeField) RexCorrelVariable(org.apache.calcite.rex.RexCorrelVariable) HepRelVertex(org.apache.calcite.plan.hep.HepRelVertex) FlinkLogicalCorrelate(org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalCorrelate) RexCall(org.apache.calcite.rex.RexCall) RexCorrelVariable(org.apache.calcite.rex.RexCorrelVariable) RexDefaultVisitor(org.apache.flink.table.planner.plan.utils.RexDefaultVisitor) LinkedList(java.util.LinkedList) RexCall(org.apache.calcite.rex.RexCall) RelDataTypeField(org.apache.calcite.rel.type.RelDataTypeField) RexInputRef(org.apache.calcite.rex.RexInputRef) RexFieldAccess(org.apache.calcite.rex.RexFieldAccess) RexNode(org.apache.calcite.rex.RexNode)
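
The second loop above works because ArrayBuffer.indices() returns a scala Range whose iterator, seen from Java, yields boxed integers typed as Object. A minimal sketch of the same "f0, f1, ..." naming pattern, assuming Scala 2.12, where Range's three-argument constructor is public (fieldNames is a hypothetical helper name):

// Iterate a scala Range from Java. Scala's Int erases to Object at the
// interop boundary, so each element arrives as a boxed java.lang.Integer.
static java.util.List<String> fieldNames(int count) {
    scala.collection.immutable.Range indices = new scala.collection.immutable.Range(0, count, 1);
    java.util.List<String> names = new java.util.LinkedList<>();
    scala.collection.Iterator<Object> it = indices.iterator();
    while (it.hasNext()) {
        names.add("f" + it.next());  // "f0", "f1", ...
    }
    return names;
}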

Aggregations

Iterator (scala.collection.Iterator): 5
ArrayList (java.util.ArrayList): 3
List (java.util.List): 3
JavaRDD (org.apache.spark.api.java.JavaRDD): 2
RDD (org.apache.spark.rdd.RDD): 2
Row (org.apache.spark.sql.Row): 2
StructField (org.apache.spark.sql.types.StructField): 2
StructType (org.apache.spark.sql.types.StructType): 2
FrameMetadata (org.apache.sysml.api.mlcontext.FrameMetadata): 2
FrameSchema (org.apache.sysml.api.mlcontext.FrameSchema): 2
MLResults (org.apache.sysml.api.mlcontext.MLResults): 2
Script (org.apache.sysml.api.mlcontext.Script): 2
ValueType (org.apache.sysml.parser.Expression.ValueType): 2
CommaSeparatedValueStringToDoubleArrayRow (org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow): 2
Tuple2 (scala.Tuple2): 2
TestingGcMonitor (com.facebook.airlift.stats.TestingGcMonitor): 1
Session (com.facebook.presto.Session): 1
StageExecutionId (com.facebook.presto.execution.StageExecutionId): 1
StageId (com.facebook.presto.execution.StageId): 1
TaskId (com.facebook.presto.execution.TaskId): 1