
Example 16 with Seq

Use of scala.collection.Seq in project systemml by apache.

From the class MLContextTest, method testInputTupleSeqNoMetadataDML.

@SuppressWarnings({ "rawtypes", "unchecked" })
@Test
public void testInputTupleSeqNoMetadataDML() {
    System.out.println("MLContextTest - Tuple sequence no metadata DML");
    List<String> list1 = new ArrayList<String>();
    list1.add("1,2");
    list1.add("3,4");
    JavaRDD<String> javaRDD1 = sc.parallelize(list1);
    RDD<String> rdd1 = JavaRDD.toRDD(javaRDD1);
    List<String> list2 = new ArrayList<String>();
    list2.add("5,6");
    list2.add("7,8");
    JavaRDD<String> javaRDD2 = sc.parallelize(list2);
    RDD<String> rdd2 = JavaRDD.toRDD(javaRDD2);
    Tuple2 tuple1 = new Tuple2("m1", rdd1);
    Tuple2 tuple2 = new Tuple2("m2", rdd2);
    List tupleList = new ArrayList();
    tupleList.add(tuple1);
    tupleList.add(tuple2);
    Seq seq = JavaConversions.asScalaBuffer(tupleList).toSeq();
    Script script = dml("print('sums: ' + sum(m1) + ' ' + sum(m2));").in(seq);
    setExpectedStdOut("sums: 10.0 26.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) Tuple2(scala.Tuple2) ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) Seq(scala.collection.Seq) Test(org.junit.Test)
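
The step that actually produces the Seq is the JavaConversions call, which wraps the java.util.List of Tuple2 inputs as a scala.collection.Seq so that the Script.in overload accepting a Seq can be used. A minimal sketch of that conversion on its own, assuming only the Scala library on the classpath (the class and method names below are hypothetical, not part of SystemML):

import java.util.List;

import scala.collection.JavaConversions;
import scala.collection.Seq;

public class SeqConversionSketch {

    // Wraps a java.util.List as a scala.collection.Seq by going through the
    // mutable Buffer view returned by JavaConversions.asScalaBuffer.
    public static <T> Seq<T> toScalaSeq(List<T> javaList) {
        return JavaConversions.asScalaBuffer(javaList).toSeq();
    }
}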

Example 17 with Seq

Use of scala.collection.Seq in project Gaffer by gchq.

From the class SortFullGroup, method call.

@Override
public OperationException call() throws IOException {
    final String firstSortColumn = sortColumns.get(0);
    final List<String> otherSortColumns = sortColumns.subList(1, sortColumns.size());
    final List<String> inputFilesThatExist = new ArrayList<>();
    for (final String file : inputFiles) {
        if (!fs.exists(new Path(file))) {
            LOGGER.info("Ignoring file {} as it does not exist", file);
        } else {
            inputFilesThatExist.add(file);
        }
    }
    if (inputFilesThatExist.isEmpty()) {
        LOGGER.info("Not sorting data for group {} as list of input files that exist is empty", group);
        return null;
    }
    // Partition by core columns (e.g. source, destination, directed for an edge) and then sort within partitions
    // by core columns and group-by columns. This ensures that all data about an edge ends up in one partition
    // but within that partition it is sorted by the core columns and the group-by columns. If we just sort by
    // the core and group-by columns then we can have the same edge split across multiple partitions (which
    // breaks our partitioning approach and would make it difficult to do query-time aggregation).
    LOGGER.info("Sorting data in {} files by columns {} to {} files in output directory {}", inputFilesThatExist.size(), StringUtils.join(sortColumns, ','), numberOfOutputFiles, outputDir);
    // NB: Don't want to include group-by columns as need to partition by core properties only (e.g. source, destination, directed)
    final ExtractKeyFromRow extractKeyFromRow = new ExtractKeyFromRow(new HashSet<>(), schemaUtils.getColumnToPaths(group), schemaUtils.getEntityGroups().contains(group), isReversed);
    LOGGER.info("Sampling data from {} input files to identify split points for sorting", inputFilesThatExist.size());
    final List<Seq<Object>> rows = spark.read().parquet(inputFilesThatExist.toArray(new String[] {})).javaRDD().map(extractKeyFromRow).takeSample(false, 10000, 1234567890L);
    LOGGER.info("Obtained {} rows in the sample", rows.size());
    final TreeSet<Seq<Object>> sortedRows = new TreeSet<>(new SeqComparator());
    sortedRows.addAll(rows);
    final TreeSet<Seq<Object>> splitPoints = new TreeSet<>(new SeqComparator());
    int desiredNumberOfSplits = numberOfOutputFiles - 1;
    long outputEveryNthRecord;
    if (sortedRows.size() < 2 || desiredNumberOfSplits < 1) {
        outputEveryNthRecord = 1;
    } else {
        outputEveryNthRecord = sortedRows.size() / desiredNumberOfSplits;
    }
    if (outputEveryNthRecord < 1) {
        outputEveryNthRecord = 1;
    }
    int numberOfSplitsOutput = 0;
    int count = 0;
    for (final Seq<Object> seq : sortedRows) {
        count++;
        if (0 == count % outputEveryNthRecord) {
            splitPoints.add(seq);
            numberOfSplitsOutput++;
        }
        if (numberOfSplitsOutput >= desiredNumberOfSplits) {
            break;
        }
    }
    LOGGER.info("Found {} split points", splitPoints.size());
    final SeqObjectPartitioner partitioner = new SeqObjectPartitioner(numberOfOutputFiles, splitPoints);
    LOGGER.info("Partitioning data using split points and sorting within partition, outputting to {}", outputDir);
    final JavaRDD<Row> partitionedData = spark.read().parquet(inputFilesThatExist.toArray(new String[] {})).javaRDD().keyBy(new ExtractKeyFromRow(new HashSet<>(), schemaUtils.getColumnToPaths(group), schemaUtils.getEntityGroups().contains(group), isReversed)).partitionBy(partitioner).values();
    LOGGER.info("Sorting data within partitions, outputting to {}", outputDir);
    spark.createDataFrame(partitionedData, schemaUtils.getSparkSchema(group)).sortWithinPartitions(firstSortColumn, otherSortColumns.stream().toArray(String[]::new)).write().option("compression", compressionCodecName.name()).parquet(outputDir);
    final FileStatus[] sortedFiles = fs.listStatus(new Path(outputDir), path -> path.getName().endsWith(".parquet"));
    final SortedSet<Path> sortedSortedFiles = new TreeSet<>();
    Arrays.stream(sortedFiles).map(FileStatus::getPath).forEach(sortedSortedFiles::add);
    final Path[] sortedSortedPaths = sortedSortedFiles.toArray(new Path[] {});
    // Rename files, e.g. part-00000-*** to partition-0, removing empty files and adapting numbers accordingly
    LOGGER.info("Renaming part-* files to partition-* files, removing empty files (part-* files are in directory {})", outputDir);
    int counter = 0;
    for (int i = 0; i < sortedSortedPaths.length; i++) {
        final Path path = sortedSortedPaths[i];
        final boolean isEmpty = isFileEmpty(path);
        if (isEmpty) {
            LOGGER.debug("Deleting empty file {}", path);
            fs.delete(path, false);
        } else {
            final Path newPath = new Path(outputDir + ParquetStore.getFile(counter));
            LOGGER.debug("Renaming {} to {}", path, newPath);
            fs.rename(path, newPath);
            // NB This automatically renames the .crc file as well
            counter++;
        }
    }
    return null;
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) Row(org.apache.spark.sql.Row) Seq(scala.collection.Seq)
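
The part of the method most specific to sorting is the split-point selection: the sampled keys are sorted and then every Nth one is kept until numberOfOutputFiles - 1 split points have been emitted. Stripped of the Spark and Gaffer types, that arithmetic looks roughly like the sketch below (class and method names are hypothetical, not Gaffer's):

import java.util.TreeSet;

public class SplitPointSketch {

    // Keeps up to (numberOfOutputFiles - 1) evenly spaced elements of a sorted
    // sample as split points, mirroring the loop in SortFullGroup.call above.
    public static <T> TreeSet<T> chooseSplitPoints(TreeSet<T> sortedSample, int numberOfOutputFiles) {
        TreeSet<T> splitPoints = new TreeSet<>(sortedSample.comparator());
        int desiredNumberOfSplits = numberOfOutputFiles - 1;
        long outputEveryNthRecord = 1;
        if (sortedSample.size() >= 2 && desiredNumberOfSplits >= 1) {
            outputEveryNthRecord = Math.max(1, sortedSample.size() / desiredNumberOfSplits);
        }
        int numberOfSplitsOutput = 0;
        int count = 0;
        for (final T element : sortedSample) {
            count++;
            if (count % outputEveryNthRecord == 0) {
                splitPoints.add(element);
                numberOfSplitsOutput++;
            }
            if (numberOfSplitsOutput >= desiredNumberOfSplits) {
                break;
            }
        }
        return splitPoints;
    }
}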

Example 18 with Seq

Use of scala.collection.Seq in project kafka by apache.

From the class TopicBasedRemoteLogMetadataManagerRestartTest, method testRLMMAPIsAfterRestart.

@Test
public void testRLMMAPIsAfterRestart() throws Exception {
    // Create topics.
    String leaderTopic = "new-leader";
    HashMap<Object, Seq<Object>> assignedLeaderTopicReplicas = new HashMap<>();
    List<Object> leaderTopicReplicas = new ArrayList<>();
    // Set broker id 0 as the first entry which is taken as the leader.
    leaderTopicReplicas.add(0);
    leaderTopicReplicas.add(1);
    leaderTopicReplicas.add(2);
    assignedLeaderTopicReplicas.put(0, JavaConverters.asScalaBuffer(leaderTopicReplicas));
    remoteLogMetadataManagerHarness.createTopicWithAssignment(leaderTopic, JavaConverters.mapAsScalaMap(assignedLeaderTopicReplicas), remoteLogMetadataManagerHarness.listenerName());
    String followerTopic = "new-follower";
    HashMap<Object, Seq<Object>> assignedFollowerTopicReplicas = new HashMap<>();
    List<Object> followerTopicReplicas = new ArrayList<>();
    // Set broker id 1 as the first entry which is taken as the leader.
    followerTopicReplicas.add(1);
    followerTopicReplicas.add(2);
    followerTopicReplicas.add(0);
    assignedFollowerTopicReplicas.put(0, JavaConverters.asScalaBuffer(followerTopicReplicas));
    remoteLogMetadataManagerHarness.createTopicWithAssignment(followerTopic, JavaConverters.mapAsScalaMap(assignedFollowerTopicReplicas), remoteLogMetadataManagerHarness.listenerName());
    final TopicIdPartition leaderTopicIdPartition = new TopicIdPartition(Uuid.randomUuid(), new TopicPartition(leaderTopic, 0));
    final TopicIdPartition followerTopicIdPartition = new TopicIdPartition(Uuid.randomUuid(), new TopicPartition(followerTopic, 0));
    // Register these partitions to RLMM.
    topicBasedRlmm().onPartitionLeadershipChanges(Collections.singleton(leaderTopicIdPartition), Collections.singleton(followerTopicIdPartition));
    // Add segments for these partitions but they are not available as they have not yet been subscribed.
    RemoteLogSegmentMetadata leaderSegmentMetadata = new RemoteLogSegmentMetadata(new RemoteLogSegmentId(leaderTopicIdPartition, Uuid.randomUuid()), 0, 100, -1L, 0, time.milliseconds(), SEG_SIZE, Collections.singletonMap(0, 0L));
    topicBasedRlmm().addRemoteLogSegmentMetadata(leaderSegmentMetadata).get();
    RemoteLogSegmentMetadata followerSegmentMetadata = new RemoteLogSegmentMetadata(new RemoteLogSegmentId(followerTopicIdPartition, Uuid.randomUuid()), 0, 100, -1L, 0, time.milliseconds(), SEG_SIZE, Collections.singletonMap(0, 0L));
    topicBasedRlmm().addRemoteLogSegmentMetadata(followerSegmentMetadata).get();
    // Stop TopicBasedRemoteLogMetadataManager only.
    stopTopicBasedRemoteLogMetadataManagerHarness();
    // Start TopicBasedRemoteLogMetadataManager but do not start consumer thread to check whether the stored metadata is
    // loaded successfully or not.
    startTopicBasedRemoteLogMetadataManagerHarness(false);
    // Register these partitions to RLMM, which loads the respective metadata snapshots.
    topicBasedRlmm().onPartitionLeadershipChanges(Collections.singleton(leaderTopicIdPartition), Collections.singleton(followerTopicIdPartition));
    // Check for the stored entries from the earlier run.
    Assertions.assertTrue(TestUtils.sameElementsWithoutOrder(Collections.singleton(leaderSegmentMetadata).iterator(), topicBasedRlmm().listRemoteLogSegments(leaderTopicIdPartition)));
    Assertions.assertTrue(TestUtils.sameElementsWithoutOrder(Collections.singleton(followerSegmentMetadata).iterator(), topicBasedRlmm().listRemoteLogSegments(followerTopicIdPartition)));
    // Check whether the check-pointed consumer offsets are stored or not.
    Path committedOffsetsPath = new File(logDir, COMMITTED_OFFSETS_FILE_NAME).toPath();
    Assertions.assertTrue(committedOffsetsPath.toFile().exists());
    CommittedOffsetsFile committedOffsetsFile = new CommittedOffsetsFile(committedOffsetsPath.toFile());
    int metadataPartition1 = topicBasedRlmm().metadataPartition(leaderTopicIdPartition);
    int metadataPartition2 = topicBasedRlmm().metadataPartition(followerTopicIdPartition);
    Optional<Long> receivedOffsetForPartition1 = topicBasedRlmm().receivedOffsetForPartition(metadataPartition1);
    Optional<Long> receivedOffsetForPartition2 = topicBasedRlmm().receivedOffsetForPartition(metadataPartition2);
    Assertions.assertTrue(receivedOffsetForPartition1.isPresent());
    Assertions.assertTrue(receivedOffsetForPartition2.isPresent());
    // Make sure these offsets are at least 0.
    Assertions.assertTrue(receivedOffsetForPartition1.get() >= 0);
    Assertions.assertTrue(receivedOffsetForPartition2.get() >= 0);
    // Check the stored entries and the offsets that were set on consumer are the same.
    Map<Integer, Long> partitionToOffset = committedOffsetsFile.readEntries();
    Assertions.assertEquals(partitionToOffset.get(metadataPartition1), receivedOffsetForPartition1.get());
    Assertions.assertEquals(partitionToOffset.get(metadataPartition2), receivedOffsetForPartition2.get());
    // Start Consumer thread
    topicBasedRlmm().startConsumerThread();
    // Add one more segment
    RemoteLogSegmentMetadata leaderSegmentMetadata2 = new RemoteLogSegmentMetadata(new RemoteLogSegmentId(leaderTopicIdPartition, Uuid.randomUuid()), 101, 200, -1L, 0, time.milliseconds(), SEG_SIZE, Collections.singletonMap(0, 101L));
    topicBasedRlmm().addRemoteLogSegmentMetadata(leaderSegmentMetadata2).get();
    // Check that both the stored segment and recently added segment are available.
    Assertions.assertTrue(TestUtils.sameElementsWithoutOrder(Arrays.asList(leaderSegmentMetadata, leaderSegmentMetadata2).iterator(), topicBasedRlmm().listRemoteLogSegments(leaderTopicIdPartition)));
}
Also used : Path(java.nio.file.Path) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TopicIdPartition(org.apache.kafka.common.TopicIdPartition) TopicPartition(org.apache.kafka.common.TopicPartition) RemoteLogSegmentId(org.apache.kafka.server.log.remote.storage.RemoteLogSegmentId) File(java.io.File) Seq(scala.collection.Seq) RemoteLogSegmentMetadata(org.apache.kafka.server.log.remote.storage.RemoteLogSegmentMetadata) Test(org.junit.jupiter.api.Test)
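
The Seq in this test only appears in the replica assignment handed to createTopicWithAssignment: each partition id maps to a scala.collection.Seq of broker ids, and the first id in the list is the one taken as the leader. A small sketch of building that map for a single partition, assuming only the Scala library (the helper name singlePartitionAssignment is hypothetical, not Kafka test code); the result would then be wrapped with JavaConverters.mapAsScalaMap exactly as in the test above:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import scala.collection.JavaConverters;
import scala.collection.Seq;

public class ReplicaAssignmentSketch {

    // Builds a partition -> replica-broker-ids map in the shape used above;
    // the first broker id in each list is treated as the leader.
    public static HashMap<Object, Seq<Object>> singlePartitionAssignment(int... brokerIds) {
        List<Object> replicas = new ArrayList<>();
        for (int brokerId : brokerIds) {
            replicas.add(brokerId);
        }
        HashMap<Object, Seq<Object>> assignment = new HashMap<>();
        assignment.put(0, JavaConverters.asScalaBuffer(replicas));
        return assignment;
    }
}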

Example 19 with Seq

Use of scala.collection.Seq in project flink by apache.

From the class BatchPhysicalPythonAggregateRule, method convert.

@Override
public RelNode convert(RelNode relNode) {
    FlinkLogicalAggregate agg = (FlinkLogicalAggregate) relNode;
    RelNode input = agg.getInput();
    int[] groupSet = agg.getGroupSet().toArray();
    RelTraitSet traitSet = relNode.getTraitSet().replace(FlinkConventions.BATCH_PHYSICAL());
    Tuple2<int[], Seq<AggregateCall>> auxGroupSetAndCallsTuple = AggregateUtil.checkAndSplitAggCalls(agg);
    int[] auxGroupSet = auxGroupSetAndCallsTuple._1;
    Seq<AggregateCall> aggCallsWithoutAuxGroupCalls = auxGroupSetAndCallsTuple._2;
    Tuple3<int[][], DataType[][], UserDefinedFunction[]> aggBufferTypesAndFunctions = AggregateUtil.transformToBatchAggregateFunctions(FlinkTypeFactory.toLogicalRowType(input.getRowType()), aggCallsWithoutAuxGroupCalls, null);
    UserDefinedFunction[] aggFunctions = aggBufferTypesAndFunctions._3();
    RelTraitSet requiredTraitSet = input.getTraitSet().replace(FlinkConventions.BATCH_PHYSICAL());
    if (groupSet.length != 0) {
        FlinkRelDistribution requiredDistribution = FlinkRelDistribution.hash(groupSet, false);
        requiredTraitSet = requiredTraitSet.replace(requiredDistribution);
        RelCollation sortCollation = createRelCollation(groupSet);
        requiredTraitSet = requiredTraitSet.replace(sortCollation);
    } else {
        requiredTraitSet = requiredTraitSet.replace(FlinkRelDistribution.SINGLETON());
    }
    RelNode convInput = RelOptRule.convert(input, requiredTraitSet);
    return new BatchPhysicalPythonGroupAggregate(relNode.getCluster(), traitSet, convInput, agg.getRowType(), convInput.getRowType(), convInput.getRowType(), groupSet, auxGroupSet, aggCallsWithoutAuxGroupCalls, aggFunctions);
}
Also used : UserDefinedFunction(org.apache.flink.table.functions.UserDefinedFunction) RelTraitSet(org.apache.calcite.plan.RelTraitSet) AggregateCall(org.apache.calcite.rel.core.AggregateCall) FlinkRelDistribution(org.apache.flink.table.planner.plan.trait.FlinkRelDistribution) RelCollation(org.apache.calcite.rel.RelCollation) RelNode(org.apache.calcite.rel.RelNode) BatchPhysicalPythonGroupAggregate(org.apache.flink.table.planner.plan.nodes.physical.batch.BatchPhysicalPythonGroupAggregate) FlinkLogicalAggregate(org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalAggregate) DataType(org.apache.flink.table.types.DataType) Seq(scala.collection.Seq)
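
Here the Seq<AggregateCall> returned by AggregateUtil.checkAndSplitAggCalls is forwarded unchanged to the BatchPhysicalPythonGroupAggregate constructor, so no conversion is needed. When Java code does need to iterate such a Seq, the usual bridge back is JavaConverters.seqAsJavaList; a minimal sketch under that assumption (the helper name is hypothetical):

import java.util.List;

import scala.collection.JavaConverters;
import scala.collection.Seq;

public class SeqBridgeSketch {

    // Exposes a scala.collection.Seq as a java.util.List view for Java-side iteration.
    public static <T> List<T> asJavaList(Seq<T> scalaSeq) {
        return JavaConverters.seqAsJavaList(scalaSeq);
    }
}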

Example 20 with Seq

Use of scala.collection.Seq in project flink by apache.

From the class BatchPhysicalPythonWindowAggregateRule, method onMatch.

@Override
public void onMatch(RelOptRuleCall call) {
    FlinkLogicalWindowAggregate agg = call.rel(0);
    RelNode input = agg.getInput();
    LogicalWindow window = agg.getWindow();
    if (!(window instanceof TumblingGroupWindow && AggregateUtil.hasTimeIntervalType(((TumblingGroupWindow) window).size()) || window instanceof SlidingGroupWindow && AggregateUtil.hasTimeIntervalType(((SlidingGroupWindow) window).size()) || window instanceof SessionGroupWindow)) {
        // sliding & tumbling count window and session window not supported
        throw new TableException("Window " + window + " is not supported right now.");
    }
    int[] groupSet = agg.getGroupSet().toArray();
    RelTraitSet traitSet = agg.getTraitSet().replace(FlinkConventions.BATCH_PHYSICAL());
    Tuple2<int[], Seq<AggregateCall>> auxGroupSetAndCallsTuple = AggregateUtil.checkAndSplitAggCalls(agg);
    int[] auxGroupSet = auxGroupSetAndCallsTuple._1;
    Seq<AggregateCall> aggCallsWithoutAuxGroupCalls = auxGroupSetAndCallsTuple._2;
    Tuple3<int[][], DataType[][], UserDefinedFunction[]> aggBufferTypesAndFunctions = AggregateUtil.transformToBatchAggregateFunctions(FlinkTypeFactory.toLogicalRowType(input.getRowType()), aggCallsWithoutAuxGroupCalls, null);
    UserDefinedFunction[] aggFunctions = aggBufferTypesAndFunctions._3();
    int inputTimeFieldIndex = AggregateUtil.timeFieldIndex(input.getRowType(), call.builder(), window.timeAttribute());
    RelDataType inputTimeFieldType = input.getRowType().getFieldList().get(inputTimeFieldIndex).getType();
    boolean inputTimeIsDate = inputTimeFieldType.getSqlTypeName() == SqlTypeName.DATE;
    RelTraitSet requiredTraitSet = agg.getTraitSet().replace(FlinkConventions.BATCH_PHYSICAL());
    if (groupSet.length != 0) {
        FlinkRelDistribution requiredDistribution = FlinkRelDistribution.hash(groupSet, false);
        requiredTraitSet = requiredTraitSet.replace(requiredDistribution);
    } else {
        requiredTraitSet = requiredTraitSet.replace(FlinkRelDistribution.SINGLETON());
    }
    RelCollation sortCollation = createRelCollation(groupSet, inputTimeFieldIndex);
    requiredTraitSet = requiredTraitSet.replace(sortCollation);
    RelNode newInput = RelOptRule.convert(input, requiredTraitSet);
    BatchPhysicalPythonGroupWindowAggregate windowAgg = new BatchPhysicalPythonGroupWindowAggregate(agg.getCluster(), traitSet, newInput, agg.getRowType(), newInput.getRowType(), groupSet, auxGroupSet, aggCallsWithoutAuxGroupCalls, aggFunctions, window, inputTimeFieldIndex, inputTimeIsDate, agg.getNamedProperties());
    call.transformTo(windowAgg);
}
Also used : TableException(org.apache.flink.table.api.TableException) UserDefinedFunction(org.apache.flink.table.functions.UserDefinedFunction) BatchPhysicalPythonGroupWindowAggregate(org.apache.flink.table.planner.plan.nodes.physical.batch.BatchPhysicalPythonGroupWindowAggregate) RelDataType(org.apache.calcite.rel.type.RelDataType) RelTraitSet(org.apache.calcite.plan.RelTraitSet) AggregateCall(org.apache.calcite.rel.core.AggregateCall) FlinkRelDistribution(org.apache.flink.table.planner.plan.trait.FlinkRelDistribution) RelCollation(org.apache.calcite.rel.RelCollation) LogicalWindow(org.apache.flink.table.planner.plan.logical.LogicalWindow) TumblingGroupWindow(org.apache.flink.table.planner.plan.logical.TumblingGroupWindow) RelNode(org.apache.calcite.rel.RelNode) FlinkLogicalWindowAggregate(org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalWindowAggregate) DataType(org.apache.flink.table.types.DataType) RelDataType(org.apache.calcite.rel.type.RelDataType) SlidingGroupWindow(org.apache.flink.table.planner.plan.logical.SlidingGroupWindow) SessionGroupWindow(org.apache.flink.table.planner.plan.logical.SessionGroupWindow) Seq(scala.collection.Seq)
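
As in the previous rule, the Seq<AggregateCall> is simply passed through to the physical node. If individual calls ever need to be inspected from Java, a scala.collection.Seq can also be read without any converter, since isEmpty() and apply(i) are ordinary methods; a small sketch of that direct access (class and method names are hypothetical):

import org.apache.calcite.rel.core.AggregateCall;

import scala.collection.Seq;

public class SeqAccessSketch {

    // Reads a scala.collection.Seq directly from Java: isEmpty() checks for
    // elements and apply(i) is positional access, so no bridge is required.
    public static AggregateCall firstCallOrNull(Seq<AggregateCall> aggCalls) {
        return aggCalls.isEmpty() ? null : aggCalls.apply(0);
    }
}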

Aggregations

Seq (scala.collection.Seq): 20
ArrayList (java.util.ArrayList): 18
List (java.util.List): 14
Script (org.apache.sysml.api.mlcontext.Script): 12
Test (org.junit.Test): 12
Tuple2 (scala.Tuple2): 6
Tuple3 (scala.Tuple3): 5
MLResults (org.apache.sysml.api.mlcontext.MLResults): 4
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 4
AggregateCall (org.apache.calcite.rel.core.AggregateCall): 3
HashMap (java.util.HashMap): 2
RelTraitSet (org.apache.calcite.plan.RelTraitSet): 2
RelCollation (org.apache.calcite.rel.RelCollation): 2
RelNode (org.apache.calcite.rel.RelNode): 2
RexInputRef (org.apache.calcite.rex.RexInputRef): 2
RexNode (org.apache.calcite.rex.RexNode): 2
UserDefinedFunction (org.apache.flink.table.functions.UserDefinedFunction): 2
FlinkRelDistribution (org.apache.flink.table.planner.plan.trait.FlinkRelDistribution): 2
DataType (org.apache.flink.table.types.DataType): 2
Option (scala.Option): 2