Use of scala.collection.Seq in project systemml by apache.
From the class MLContextTest, method testInputTupleSeqNoMetadataDML:
@SuppressWarnings({ "rawtypes", "unchecked" })
@Test
public void testInputTupleSeqNoMetadataDML() {
    System.out.println("MLContextTest - Tuple sequence no metadata DML");
    List<String> list1 = new ArrayList<String>();
    list1.add("1,2");
    list1.add("3,4");
    JavaRDD<String> javaRDD1 = sc.parallelize(list1);
    RDD<String> rdd1 = JavaRDD.toRDD(javaRDD1);
    List<String> list2 = new ArrayList<String>();
    list2.add("5,6");
    list2.add("7,8");
    JavaRDD<String> javaRDD2 = sc.parallelize(list2);
    RDD<String> rdd2 = JavaRDD.toRDD(javaRDD2);
    Tuple2 tuple1 = new Tuple2("m1", rdd1);
    Tuple2 tuple2 = new Tuple2("m2", rdd2);
    List tupleList = new ArrayList();
    tupleList.add(tuple1);
    tupleList.add(tuple2);
    Seq seq = JavaConversions.asScalaBuffer(tupleList).toSeq();
    Script script = dml("print('sums: ' + sum(m1) + ' ' + sum(m2));").in(seq);
    setExpectedStdOut("sums: 10.0 26.0");
    ml.execute(script);
}
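For reference, a minimal standalone sketch of the Java-to-Scala conversion this test depends on. The class name is illustrative; note also that JavaConversions is deprecated in newer Scala versions in favour of JavaConverters, which the Kafka example further down uses.

import java.util.ArrayList;
import java.util.List;
import scala.collection.JavaConversions;
import scala.collection.Seq;

// Minimal sketch (illustrative class name): wrap a java.util.List as a Scala
// Buffer via JavaConversions.asScalaBuffer, then materialize it as a Seq.
public class SeqConversionSketch {
    public static void main(String[] args) {
        List<String> names = new ArrayList<>();
        names.add("m1");
        names.add("m2");
        // asScalaBuffer returns a live mutable.Buffer view over the Java list;
        // toSeq then exposes it under the scala.collection.Seq interface.
        Seq<String> seq = JavaConversions.asScalaBuffer(names).toSeq();
        System.out.println(seq.size()); // prints 2
    }
}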
Use of scala.collection.Seq in project Gaffer by gchq.
From the class SortFullGroup, method call:
@Override
public OperationException call() throws IOException {
    final String firstSortColumn = sortColumns.get(0);
    final List<String> otherSortColumns = sortColumns.subList(1, sortColumns.size());
    final List<String> inputFilesThatExist = new ArrayList<>();
    for (final String file : inputFiles) {
        if (!fs.exists(new Path(file))) {
            LOGGER.info("Ignoring file {} as it does not exist", file);
        } else {
            inputFilesThatExist.add(file);
        }
    }
    if (inputFilesThatExist.isEmpty()) {
        LOGGER.info("Not sorting data for group {} as list of input files that exist is empty", group);
        return null;
    }
    // Partition by core columns (e.g. source, destination, directed for an edge) and then sort within
    // partitions by core columns and group-by columns. This ensures that all data about an edge ends up
    // in one partition, but within that partition it is sorted by the core columns and the group-by
    // columns. If we just sort by the core and group-by columns then we can have the same edge split
    // across multiple partitions (which breaks our partitioning approach and would make it difficult
    // to do query-time aggregation).
    LOGGER.info("Sorting data in {} files by columns {} to {} files in output directory {}",
            inputFilesThatExist.size(), StringUtils.join(sortColumns, ','), numberOfOutputFiles, outputDir);
    // NB: Don't want to include group-by columns, as we need to partition by core properties only
    // (e.g. source, destination, directed).
    final ExtractKeyFromRow extractKeyFromRow = new ExtractKeyFromRow(new HashSet<>(),
            schemaUtils.getColumnToPaths(group), schemaUtils.getEntityGroups().contains(group), isReversed);
    LOGGER.info("Sampling data from {} input files to identify split points for sorting",
            inputFilesThatExist.size());
    final List<Seq<Object>> rows = spark.read()
            .parquet(inputFilesThatExist.toArray(new String[] {}))
            .javaRDD()
            .map(extractKeyFromRow)
            .takeSample(false, 10000, 1234567890L);
    LOGGER.info("Obtained {} rows in the sample", rows.size());
    final TreeSet<Seq<Object>> sortedRows = new TreeSet<>(new SeqComparator());
    sortedRows.addAll(rows);
    final TreeSet<Seq<Object>> splitPoints = new TreeSet<>(new SeqComparator());
    int desiredNumberOfSplits = numberOfOutputFiles - 1;
    long outputEveryNthRecord;
    if (sortedRows.size() < 2 || desiredNumberOfSplits < 1) {
        outputEveryNthRecord = 1;
    } else {
        outputEveryNthRecord = sortedRows.size() / desiredNumberOfSplits;
    }
    if (outputEveryNthRecord < 1) {
        outputEveryNthRecord = 1;
    }
    int numberOfSplitsOutput = 0;
    int count = 0;
    for (final Seq<Object> seq : sortedRows) {
        count++;
        if (0 == count % outputEveryNthRecord) {
            splitPoints.add(seq);
            numberOfSplitsOutput++;
        }
        if (numberOfSplitsOutput >= desiredNumberOfSplits) {
            break;
        }
    }
    LOGGER.info("Found {} split points", splitPoints.size());
    final SeqObjectPartitioner partitioner = new SeqObjectPartitioner(numberOfOutputFiles, splitPoints);
    LOGGER.info("Partitioning data using split points and sorting within partition, outputting to {}", outputDir);
    final JavaRDD<Row> partitionedData = spark.read()
            .parquet(inputFilesThatExist.toArray(new String[] {}))
            .javaRDD()
            .keyBy(new ExtractKeyFromRow(new HashSet<>(), schemaUtils.getColumnToPaths(group),
                    schemaUtils.getEntityGroups().contains(group), isReversed))
            .partitionBy(partitioner)
            .values();
    LOGGER.info("Sorting data within partitions, outputting to {}", outputDir);
    spark.createDataFrame(partitionedData, schemaUtils.getSparkSchema(group))
            .sortWithinPartitions(firstSortColumn, otherSortColumns.stream().toArray(String[]::new))
            .write()
            .option("compression", compressionCodecName.name())
            .parquet(outputDir);
    final FileStatus[] sortedFiles = fs.listStatus(new Path(outputDir),
            path -> path.getName().endsWith(".parquet"));
    final SortedSet<Path> sortedSortedFiles = new TreeSet<>();
    Arrays.stream(sortedFiles).map(FileStatus::getPath).forEach(sortedSortedFiles::add);
    final Path[] sortedSortedPaths = sortedSortedFiles.toArray(new Path[] {});
    // Rename files, e.g. part-00000-*** to partition-0, removing empty files and adapting numbers accordingly.
    LOGGER.info("Renaming part-* files to partition-* files, removing empty files (part-* files are in directory {})", outputDir);
    int counter = 0;
    for (int i = 0; i < sortedSortedPaths.length; i++) {
        final Path path = sortedSortedPaths[i];
        final boolean isEmpty = isFileEmpty(path);
        if (isEmpty) {
            LOGGER.debug("Deleting empty file {}", path);
            fs.delete(path, false);
        } else {
            final Path newPath = new Path(outputDir + ParquetStore.getFile(counter));
            LOGGER.debug("Renaming {} to {}", path, newPath);
            fs.rename(path, newPath);
            // NB: This automatically renames the .crc file as well.
            counter++;
        }
    }
    return null;
}
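The split-point selection in the middle of this method is just sampling arithmetic: from a sorted sample of N keys and a target of F output files, it emits roughly every (N / (F - 1))-th key as a split point, capped at F - 1 splits. A hypothetical, self-contained sketch of that logic with plain integers standing in for the Seq<Object> row keys (class and method names are illustrative, not Gaffer API):

import java.util.TreeSet;

// Hypothetical sketch of SortFullGroup's split-point selection, with integers
// in place of the sampled Seq<Object> keys.
public class SplitPointSketch {

    static TreeSet<Integer> selectSplitPoints(final TreeSet<Integer> sortedSample, final int numberOfOutputFiles) {
        final TreeSet<Integer> splitPoints = new TreeSet<>();
        final int desiredNumberOfSplits = numberOfOutputFiles - 1;
        long outputEveryNthRecord;
        if (sortedSample.size() < 2 || desiredNumberOfSplits < 1) {
            outputEveryNthRecord = 1;
        } else {
            outputEveryNthRecord = sortedSample.size() / desiredNumberOfSplits;
        }
        if (outputEveryNthRecord < 1) {
            outputEveryNthRecord = 1;
        }
        int numberOfSplitsOutput = 0;
        int count = 0;
        for (final Integer key : sortedSample) {
            count++;
            // Take every Nth key as a split point, stopping once we have enough.
            if (0 == count % outputEveryNthRecord) {
                splitPoints.add(key);
                numberOfSplitsOutput++;
            }
            if (numberOfSplitsOutput >= desiredNumberOfSplits) {
                break;
            }
        }
        return splitPoints;
    }

    public static void main(String[] args) {
        final TreeSet<Integer> sample = new TreeSet<>();
        for (int i = 1; i <= 100; i++) {
            sample.add(i);
        }
        // 100 sampled keys, 5 output files -> every 25th key, 4 split points.
        System.out.println(selectSplitPoints(sample, 5)); // [25, 50, 75, 100]
    }
}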
Use of scala.collection.Seq in project kafka by apache.
From the class TopicBasedRemoteLogMetadataManagerRestartTest, method testRLMMAPIsAfterRestart:
@Test
public void testRLMMAPIsAfterRestart() throws Exception {
    // Create topics.
    String leaderTopic = "new-leader";
    HashMap<Object, Seq<Object>> assignedLeaderTopicReplicas = new HashMap<>();
    List<Object> leaderTopicReplicas = new ArrayList<>();
    // Set broker id 0 as the first entry, which is taken as the leader.
    leaderTopicReplicas.add(0);
    leaderTopicReplicas.add(1);
    leaderTopicReplicas.add(2);
    assignedLeaderTopicReplicas.put(0, JavaConverters.asScalaBuffer(leaderTopicReplicas));
    remoteLogMetadataManagerHarness.createTopicWithAssignment(leaderTopic,
            JavaConverters.mapAsScalaMap(assignedLeaderTopicReplicas),
            remoteLogMetadataManagerHarness.listenerName());
    String followerTopic = "new-follower";
    HashMap<Object, Seq<Object>> assignedFollowerTopicReplicas = new HashMap<>();
    List<Object> followerTopicReplicas = new ArrayList<>();
    // Set broker id 1 as the first entry, which is taken as the leader.
    followerTopicReplicas.add(1);
    followerTopicReplicas.add(2);
    followerTopicReplicas.add(0);
    assignedFollowerTopicReplicas.put(0, JavaConverters.asScalaBuffer(followerTopicReplicas));
    remoteLogMetadataManagerHarness.createTopicWithAssignment(followerTopic,
            JavaConverters.mapAsScalaMap(assignedFollowerTopicReplicas),
            remoteLogMetadataManagerHarness.listenerName());
    final TopicIdPartition leaderTopicIdPartition =
            new TopicIdPartition(Uuid.randomUuid(), new TopicPartition(leaderTopic, 0));
    final TopicIdPartition followerTopicIdPartition =
            new TopicIdPartition(Uuid.randomUuid(), new TopicPartition(followerTopic, 0));
    // Register these partitions with RLMM.
    topicBasedRlmm().onPartitionLeadershipChanges(Collections.singleton(leaderTopicIdPartition),
            Collections.singleton(followerTopicIdPartition));
    // Add segments for these partitions; they are not yet available, as they have not been subscribed.
    RemoteLogSegmentMetadata leaderSegmentMetadata = new RemoteLogSegmentMetadata(
            new RemoteLogSegmentId(leaderTopicIdPartition, Uuid.randomUuid()),
            0, 100, -1L, 0, time.milliseconds(), SEG_SIZE, Collections.singletonMap(0, 0L));
    topicBasedRlmm().addRemoteLogSegmentMetadata(leaderSegmentMetadata).get();
    RemoteLogSegmentMetadata followerSegmentMetadata = new RemoteLogSegmentMetadata(
            new RemoteLogSegmentId(followerTopicIdPartition, Uuid.randomUuid()),
            0, 100, -1L, 0, time.milliseconds(), SEG_SIZE, Collections.singletonMap(0, 0L));
    topicBasedRlmm().addRemoteLogSegmentMetadata(followerSegmentMetadata).get();
    // Stop TopicBasedRemoteLogMetadataManager only.
    stopTopicBasedRemoteLogMetadataManagerHarness();
    // Restart TopicBasedRemoteLogMetadataManager without starting the consumer thread,
    // to check whether the stored metadata is loaded successfully.
    startTopicBasedRemoteLogMetadataManagerHarness(false);
    // Register these partitions with RLMM, which loads the respective metadata snapshots.
    topicBasedRlmm().onPartitionLeadershipChanges(Collections.singleton(leaderTopicIdPartition),
            Collections.singleton(followerTopicIdPartition));
    // Check for the stored entries from the earlier run.
    Assertions.assertTrue(TestUtils.sameElementsWithoutOrder(
            Collections.singleton(leaderSegmentMetadata).iterator(),
            topicBasedRlmm().listRemoteLogSegments(leaderTopicIdPartition)));
    Assertions.assertTrue(TestUtils.sameElementsWithoutOrder(
            Collections.singleton(followerSegmentMetadata).iterator(),
            topicBasedRlmm().listRemoteLogSegments(followerTopicIdPartition)));
    // Check whether the check-pointed consumer offsets are stored.
    Path committedOffsetsPath = new File(logDir, COMMITTED_OFFSETS_FILE_NAME).toPath();
    Assertions.assertTrue(committedOffsetsPath.toFile().exists());
    CommittedOffsetsFile committedOffsetsFile = new CommittedOffsetsFile(committedOffsetsPath.toFile());
    int metadataPartition1 = topicBasedRlmm().metadataPartition(leaderTopicIdPartition);
    int metadataPartition2 = topicBasedRlmm().metadataPartition(followerTopicIdPartition);
    Optional<Long> receivedOffsetForPartition1 = topicBasedRlmm().receivedOffsetForPartition(metadataPartition1);
    Optional<Long> receivedOffsetForPartition2 = topicBasedRlmm().receivedOffsetForPartition(metadataPartition2);
    Assertions.assertTrue(receivedOffsetForPartition1.isPresent());
    Assertions.assertTrue(receivedOffsetForPartition2.isPresent());
    // Make sure these offsets are at least 0.
    Assertions.assertTrue(receivedOffsetForPartition1.get() >= 0);
    Assertions.assertTrue(receivedOffsetForPartition2.get() >= 0);
    // Check that the stored entries and the offsets set on the consumer are the same.
    Map<Integer, Long> partitionToOffset = committedOffsetsFile.readEntries();
    Assertions.assertEquals(partitionToOffset.get(metadataPartition1), receivedOffsetForPartition1.get());
    Assertions.assertEquals(partitionToOffset.get(metadataPartition2), receivedOffsetForPartition2.get());
    // Start the consumer thread.
    topicBasedRlmm().startConsumerThread();
    // Add one more segment.
    RemoteLogSegmentMetadata leaderSegmentMetadata2 = new RemoteLogSegmentMetadata(
            new RemoteLogSegmentId(leaderTopicIdPartition, Uuid.randomUuid()),
            101, 200, -1L, 0, time.milliseconds(), SEG_SIZE, Collections.singletonMap(0, 101L));
    topicBasedRlmm().addRemoteLogSegmentMetadata(leaderSegmentMetadata2).get();
    // Check that both the stored segment and the recently added segment are available.
    Assertions.assertTrue(TestUtils.sameElementsWithoutOrder(
            Arrays.asList(leaderSegmentMetadata, leaderSegmentMetadata2).iterator(),
            topicBasedRlmm().listRemoteLogSegments(leaderTopicIdPartition)));
}
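The replica-assignment plumbing above follows a common pattern when calling Scala Kafka test utilities from Java. A minimal sketch of just that conversion, using the same calls as the test (the class name is illustrative):

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import scala.collection.JavaConverters;
import scala.collection.Seq;

// Minimal sketch (illustrative class name): build a partition -> replica-list
// assignment with Java collections, then convert to the Scala types a harness
// method such as createTopicWithAssignment expects.
public class AssignmentSketch {
    public static void main(String[] args) {
        HashMap<Object, Seq<Object>> assignment = new HashMap<>();
        List<Object> replicas = new ArrayList<>();
        // The first entry is taken as the preferred leader (broker 0 here).
        replicas.add(0);
        replicas.add(1);
        replicas.add(2);
        // asScalaBuffer wraps the Java list; a mutable.Buffer is a Seq.
        assignment.put(0, JavaConverters.asScalaBuffer(replicas));
        // mapAsScalaMap wraps the Java map as a scala.collection.mutable.Map.
        System.out.println(JavaConverters.mapAsScalaMap(assignment));
    }
}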
Use of scala.collection.Seq in project flink by apache.
From the class BatchPhysicalPythonAggregateRule, method convert:
@Override
public RelNode convert(RelNode relNode) {
    FlinkLogicalAggregate agg = (FlinkLogicalAggregate) relNode;
    RelNode input = agg.getInput();
    int[] groupSet = agg.getGroupSet().toArray();
    RelTraitSet traitSet = relNode.getTraitSet().replace(FlinkConventions.BATCH_PHYSICAL());
    Tuple2<int[], Seq<AggregateCall>> auxGroupSetAndCallsTuple = AggregateUtil.checkAndSplitAggCalls(agg);
    int[] auxGroupSet = auxGroupSetAndCallsTuple._1;
    Seq<AggregateCall> aggCallsWithoutAuxGroupCalls = auxGroupSetAndCallsTuple._2;
    Tuple3<int[][], DataType[][], UserDefinedFunction[]> aggBufferTypesAndFunctions =
            AggregateUtil.transformToBatchAggregateFunctions(
                    FlinkTypeFactory.toLogicalRowType(input.getRowType()),
                    aggCallsWithoutAuxGroupCalls, null);
    UserDefinedFunction[] aggFunctions = aggBufferTypesAndFunctions._3();
    RelTraitSet requiredTraitSet = input.getTraitSet().replace(FlinkConventions.BATCH_PHYSICAL());
    if (groupSet.length != 0) {
        FlinkRelDistribution requiredDistribution = FlinkRelDistribution.hash(groupSet, false);
        requiredTraitSet = requiredTraitSet.replace(requiredDistribution);
        RelCollation sortCollation = createRelCollation(groupSet);
        requiredTraitSet = requiredTraitSet.replace(sortCollation);
    } else {
        requiredTraitSet = requiredTraitSet.replace(FlinkRelDistribution.SINGLETON());
    }
    RelNode convInput = RelOptRule.convert(input, requiredTraitSet);
    return new BatchPhysicalPythonGroupAggregate(relNode.getCluster(), traitSet, convInput,
            agg.getRowType(), convInput.getRowType(), convInput.getRowType(), groupSet,
            auxGroupSet, aggCallsWithoutAuxGroupCalls, aggFunctions);
}
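One interop detail worth noting: the Scala tuple elements are read here both as fields (auxGroupSetAndCallsTuple._1) and as accessor methods (aggBufferTypesAndFunctions._3()). Both forms appear in the method above and compile from Java against the tuple classes shipped with Scala 2.x. A minimal standalone sketch (illustrative class name):

import scala.Tuple2;
import scala.Tuple3;

// Minimal sketch (illustrative class name): Scala tuples expose elements to
// Java as public final fields (_1, _2, ...) and as accessors (_1(), _2(), ...).
public class TupleAccessSketch {
    public static void main(String[] args) {
        Tuple2<Integer, String> pair = new Tuple2<>(1, "one");
        Tuple3<Integer, String, Double> triple = new Tuple3<>(1, "one", 1.0);
        System.out.println(pair._1 + " " + pair._2);          // field access
        System.out.println(triple._1() + " " + triple._3());  // accessor methods
    }
}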
Use of scala.collection.Seq in project flink by apache.
From the class BatchPhysicalPythonWindowAggregateRule, method onMatch:
@Override
public void onMatch(RelOptRuleCall call) {
    FlinkLogicalWindowAggregate agg = call.rel(0);
    RelNode input = agg.getInput();
    LogicalWindow window = agg.getWindow();
    if (!(window instanceof TumblingGroupWindow
                    && AggregateUtil.hasTimeIntervalType(((TumblingGroupWindow) window).size())
            || window instanceof SlidingGroupWindow
                    && AggregateUtil.hasTimeIntervalType(((SlidingGroupWindow) window).size())
            || window instanceof SessionGroupWindow)) {
        // Count-based sliding and tumbling windows are not supported.
        throw new TableException("Window " + window + " is not supported right now.");
    }
    int[] groupSet = agg.getGroupSet().toArray();
    RelTraitSet traitSet = agg.getTraitSet().replace(FlinkConventions.BATCH_PHYSICAL());
    Tuple2<int[], Seq<AggregateCall>> auxGroupSetAndCallsTuple = AggregateUtil.checkAndSplitAggCalls(agg);
    int[] auxGroupSet = auxGroupSetAndCallsTuple._1;
    Seq<AggregateCall> aggCallsWithoutAuxGroupCalls = auxGroupSetAndCallsTuple._2;
    Tuple3<int[][], DataType[][], UserDefinedFunction[]> aggBufferTypesAndFunctions =
            AggregateUtil.transformToBatchAggregateFunctions(
                    FlinkTypeFactory.toLogicalRowType(input.getRowType()),
                    aggCallsWithoutAuxGroupCalls, null);
    UserDefinedFunction[] aggFunctions = aggBufferTypesAndFunctions._3();
    int inputTimeFieldIndex = AggregateUtil.timeFieldIndex(input.getRowType(), call.builder(), window.timeAttribute());
    RelDataType inputTimeFieldType = input.getRowType().getFieldList().get(inputTimeFieldIndex).getType();
    boolean inputTimeIsDate = inputTimeFieldType.getSqlTypeName() == SqlTypeName.DATE;
    RelTraitSet requiredTraitSet = agg.getTraitSet().replace(FlinkConventions.BATCH_PHYSICAL());
    if (groupSet.length != 0) {
        FlinkRelDistribution requiredDistribution = FlinkRelDistribution.hash(groupSet, false);
        requiredTraitSet = requiredTraitSet.replace(requiredDistribution);
    } else {
        requiredTraitSet = requiredTraitSet.replace(FlinkRelDistribution.SINGLETON());
    }
    RelCollation sortCollation = createRelCollation(groupSet, inputTimeFieldIndex);
    requiredTraitSet = requiredTraitSet.replace(sortCollation);
    RelNode newInput = RelOptRule.convert(input, requiredTraitSet);
    BatchPhysicalPythonGroupWindowAggregate windowAgg = new BatchPhysicalPythonGroupWindowAggregate(
            agg.getCluster(), traitSet, newInput, agg.getRowType(), newInput.getRowType(),
            groupSet, auxGroupSet, aggCallsWithoutAuxGroupCalls, aggFunctions, window,
            inputTimeFieldIndex, inputTimeIsDate, agg.getNamedProperties());
    call.transformTo(windowAgg);
}
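The supported-window predicate at the top of onMatch is dense; since && binds tighter than ||, it groups as (tumbling with a time-interval size) or (sliding with a time-interval size) or (session window). A hypothetical readability refactoring using only calls that appear above; the helper name isWindowSupported is illustrative, not Flink API, and the window and utility types are assumed imported from the Flink table planner:

// Hypothetical helper (illustrative name): the rule supports time-interval
// tumbling and sliding windows plus session windows; count windows are rejected.
private static boolean isWindowSupported(final LogicalWindow window) {
    if (window instanceof TumblingGroupWindow) {
        return AggregateUtil.hasTimeIntervalType(((TumblingGroupWindow) window).size());
    }
    if (window instanceof SlidingGroupWindow) {
        return AggregateUtil.hasTimeIntervalType(((SlidingGroupWindow) window).size());
    }
    // Session windows are defined by a time gap, so they always pass this check.
    return window instanceof SessionGroupWindow;
}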