
Example 61 with Collector

Use of org.apache.flink.util.Collector in project flink by apache.

From the class DataSetUtils, method summarize.

// --------------------------------------------------------------------------------------------
// Summarize
// --------------------------------------------------------------------------------------------
/**
 * Summarize a DataSet of Tuples by collecting single-pass statistics for all columns.
 *
 * <p>Example usage:
 *
 * <pre>{@code
 * DataSet<Tuple3<Double, String, Boolean>> input = // [...]
 * Tuple3<NumericColumnSummary, StringColumnSummary, BooleanColumnSummary> summary = DataSetUtils.summarize(input);
 *
 * summary.f0.getStandardDeviation();
 * summary.f1.getMaxLength();
 * }</pre>
 *
 * @return the summary as a Tuple of the same width as the input rows
 */
public static <R extends Tuple, T extends Tuple> R summarize(DataSet<T> input) throws Exception {
    if (!input.getType().isTupleType()) {
        throw new IllegalArgumentException("summarize() is only implemented for DataSet's of Tuples");
    }
    final TupleTypeInfoBase<?> inType = (TupleTypeInfoBase<?>) input.getType();
    DataSet<TupleSummaryAggregator<R>> result = input.mapPartition(new MapPartitionFunction<T, TupleSummaryAggregator<R>>() {

        @Override
        public void mapPartition(Iterable<T> values, Collector<TupleSummaryAggregator<R>> out) throws Exception {
            TupleSummaryAggregator<R> aggregator = SummaryAggregatorFactory.create(inType);
            for (Tuple value : values) {
                aggregator.aggregate(value);
            }
            out.collect(aggregator);
        }
    }).reduce(new ReduceFunction<TupleSummaryAggregator<R>>() {

        @Override
        public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> agg1, TupleSummaryAggregator<R> agg2) throws Exception {
            agg1.combine(agg2);
            return agg1;
        }
    });
    return result.collect().get(0).result();
}
Also used : RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) MapPartitionFunction(org.apache.flink.api.common.functions.MapPartitionFunction) TupleTypeInfoBase(org.apache.flink.api.java.typeutils.TupleTypeInfoBase) Collector(org.apache.flink.util.Collector) TupleSummaryAggregator(org.apache.flink.api.java.summarize.aggregation.TupleSummaryAggregator) Tuple(org.apache.flink.api.java.tuple.Tuple)
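
For orientation, here is a minimal usage sketch (not part of the indexed source) that drives summarize() end to end; the input values and the SummarizeExample class name are invented for illustration. Note that summarize() executes the job itself via collect(), so no env.execute() call is needed.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.summarize.NumericColumnSummary;
import org.apache.flink.api.java.summarize.StringColumnSummary;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.DataSetUtils;

public class SummarizeExample {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Tuple2<Double, String>> input =
                env.fromElements(
                        Tuple2.of(1.0, "a"),
                        Tuple2.of(2.0, "bb"),
                        Tuple2.of(3.0, "ccc"));

        // The summary tuple has the same width as the input rows:
        // column 0 gets a numeric summary, column 1 a string summary.
        Tuple2<NumericColumnSummary<Double>, StringColumnSummary> summary =
                DataSetUtils.summarize(input);

        System.out.println(summary.f0.getStandardDeviation()); // stddev of column 0
        System.out.println(summary.f1.getMaxLength());         // longest string in column 1
    }
}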

Example 62 with Collector

Use of org.apache.flink.util.Collector in project flink by apache.

From the class KubernetesHighAvailabilityRecoverFromSavepointITCase, method createJobGraph.

private JobGraph createJobGraph() throws Exception {
    final StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
    final StateBackend stateBackend = new FsStateBackend(temporaryFolder.newFolder().toURI(), 1);
    sEnv.setStateBackend(stateBackend);
    sEnv.addSource(new InfiniteSourceFunction()).keyBy(e -> e).flatMap(new RichFlatMapFunction<Integer, Integer>() {

        private static final long serialVersionUID = 1L;

        ValueState<Integer> state;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            ValueStateDescriptor<Integer> descriptor = new ValueStateDescriptor<>("total", Types.INT);
            state = getRuntimeContext().getState(descriptor);
        }

        @Override
        public void flatMap(Integer value, Collector<Integer> out) throws Exception {
            final Integer current = state.value();
            if (current != null) {
                value += current;
            }
            state.update(value);
            out.collect(value);
        }
    }).uid(FLAT_MAP_UID).addSink(new DiscardingSink<>());
    return sEnv.getStreamGraph().getJobGraph();
}
Also used : RichFlatMapFunction(org.apache.flink.api.common.functions.RichFlatMapFunction) Deadline(org.apache.flink.api.common.time.Deadline) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Random(java.util.Random) JobStatus(org.apache.flink.api.common.JobStatus) FunctionSnapshotContext(org.apache.flink.runtime.state.FunctionSnapshotContext) MiniClusterResourceConfiguration(org.apache.flink.runtime.testutils.MiniClusterResourceConfiguration) BasicTypeInfo(org.apache.flink.api.common.typeinfo.BasicTypeInfo) ListState(org.apache.flink.api.common.state.ListState) StateBackend(org.apache.flink.runtime.state.StateBackend) Collector(org.apache.flink.util.Collector) Duration(java.time.Duration) RichParallelSourceFunction(org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction) TestLogger(org.apache.flink.util.TestLogger) ListStateDescriptor(org.apache.flink.api.common.state.ListStateDescriptor) ClassRule(org.junit.ClassRule) Before(org.junit.Before) Types(org.apache.flink.api.common.typeinfo.Types) MiniClusterWithClientResource(org.apache.flink.test.util.MiniClusterWithClientResource) KubernetesConfigOptions(org.apache.flink.kubernetes.configuration.KubernetesConfigOptions) CheckpointedFunction(org.apache.flink.streaming.api.checkpoint.CheckpointedFunction) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) DiscardingSink(org.apache.flink.streaming.api.functions.sink.DiscardingSink) ValueStateDescriptor(org.apache.flink.api.common.state.ValueStateDescriptor) FunctionInitializationContext(org.apache.flink.runtime.state.FunctionInitializationContext) Configuration(org.apache.flink.configuration.Configuration) Test(org.junit.Test) IOException(java.io.IOException) FsStateBackend(org.apache.flink.runtime.state.filesystem.FsStateBackend) TimeUnit(java.util.concurrent.TimeUnit) TestingUtils(org.apache.flink.testutils.TestingUtils) JobID(org.apache.flink.api.common.JobID) Rule(org.junit.Rule) ValueState(org.apache.flink.api.common.state.ValueState) ClusterClient(org.apache.flink.client.program.ClusterClient) KubernetesResource(org.apache.flink.kubernetes.KubernetesResource) CommonTestUtils(org.apache.flink.runtime.testutils.CommonTestUtils) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) HighAvailabilityOptions(org.apache.flink.configuration.HighAvailabilityOptions) TemporaryFolder(org.junit.rules.TemporaryFolder) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
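
The snippet above references InfiniteSourceFunction without showing it. Below is a plausible minimal sketch, assuming (based on the imports listed) an unbounded RichParallelSourceFunction emitting integers; the actual test version may differ, for example by also implementing CheckpointedFunction.

import java.util.Random;

import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

// In the real test this is a private static nested class.
class InfiniteSourceFunction extends RichParallelSourceFunction<Integer> {

    private static final long serialVersionUID = 1L;

    private volatile boolean running = true;

    @Override
    public void run(SourceContext<Integer> ctx) throws Exception {
        final Random random = new Random();
        while (running) {
            // Emit under the checkpoint lock so records and checkpoints
            // do not interleave.
            synchronized (ctx.getCheckpointLock()) {
                ctx.collect(random.nextInt());
            }
            Thread.sleep(5L);
        }
    }

    @Override
    public void cancel() {
        running = false;
    }
}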

Example 63 with Collector

Use of org.apache.flink.util.Collector in project flink by apache.

From the class KinesisConsumerTest, method testKinesisConsumerThrowsExceptionIfSchemaImplementsCollector.

@Test
public void testKinesisConsumerThrowsExceptionIfSchemaImplementsCollector() {
    DeserializationSchema<Object> schemaWithCollector = new DeserializationSchema<Object>() {

        @Override
        public Object deserialize(byte[] message) throws IOException {
            return null;
        }

        @Override
        public void deserialize(byte[] message, Collector<Object> out) throws IOException {
            // we do not care about the implementation. we should just check if this
            // method is declared
        }

        @Override
        public boolean isEndOfStream(Object nextElement) {
            return false;
        }

        @Override
        public TypeInformation<Object> getProducedType() {
            return null;
        }
    };
    thrown.expect(IllegalArgumentException.class);
    thrown.expectMessage("Kinesis consumer does not support DeserializationSchema that implements deserialization with a" + " Collector. Unsupported DeserializationSchema: " + "org.apache.flink.streaming.connectors.kinesis.KinesisConsumerTest");
    new FlinkKinesisConsumer<>("fakeStream", schemaWithCollector, new Properties());
}
Also used : Collector(org.apache.flink.util.Collector) Properties(java.util.Properties) DeserializationSchema(org.apache.flink.api.common.serialization.DeserializationSchema) Test(org.junit.Test)
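
For contrast, here is a hedged sketch of a schema the consumer does accept: SimpleStringSchema implements only the single-record deserialize(byte[]) variant, so the same constructor succeeds. The property values below are placeholders chosen to satisfy config validation, not working credentials.

import java.util.Properties;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.connectors.kinesis.FlinkKinesisConsumer;
import org.apache.flink.streaming.connectors.kinesis.config.ConsumerConfigConstants;

public class ValidKinesisSchemaExample {
    public static void main(String[] args) {
        Properties props = new Properties();
        // Region and dummy credentials (placeholders) so that the constructor's
        // configuration validation passes without contacting AWS.
        props.setProperty(ConsumerConfigConstants.AWS_REGION, "us-east-1");
        props.setProperty(ConsumerConfigConstants.AWS_ACCESS_KEY_ID, "fakeAccessKeyId");
        props.setProperty(ConsumerConfigConstants.AWS_SECRET_ACCESS_KEY, "fakeSecretAccessKey");

        // No IllegalArgumentException here: SimpleStringSchema does not override
        // the Collector-based deserialize variant.
        FlinkKinesisConsumer<String> consumer =
                new FlinkKinesisConsumer<>("fakeStream", new SimpleStringSchema(), props);
    }
}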

Example 64 with Collector

Use of org.apache.flink.util.Collector in project flink by apache.

From the class OggJsonSerDeSchemaTest, method testDeserializationWithMetadata.

public void testDeserializationWithMetadata(String resourceFile) throws Exception {
    // we only read the first line to keep the test simple
    final String firstLine = readLines(resourceFile).get(0);
    final List<ReadableMetadata> requestedMetadata = Arrays.asList(ReadableMetadata.values());
    final DataType producedDataTypes = DataTypeUtils.appendRowFields(PHYSICAL_DATA_TYPE, requestedMetadata.stream().map(m -> DataTypes.FIELD(m.key, m.dataType)).collect(Collectors.toList()));
    final OggJsonDeserializationSchema deserializationSchema = new OggJsonDeserializationSchema(PHYSICAL_DATA_TYPE, requestedMetadata, InternalTypeInfo.of(producedDataTypes.getLogicalType()), false, TimestampFormat.ISO_8601);
    final SimpleCollector collector = new SimpleCollector();
    deserializationSchema.deserialize(firstLine.getBytes(StandardCharsets.UTF_8), collector);
    assertEquals(1, collector.list.size());
    Consumer<RowData> consumer = row -> {
        assertEquals(101, row.getInt(0));
        assertEquals("scooter", row.getString(1).toString());
        assertEquals("Small 2-wheel scooter", row.getString(2).toString());
        assertEquals(3.140000104904175, row.getFloat(3), 1e-15);
        assertEquals("OGG.TBL_TEST", row.getString(4).toString());
        assertEquals("id", row.getArray(5).getString(0).toString());
        assertEquals(1589377175766L, row.getTimestamp(6, 6).getMillisecond());
        assertEquals(1589384406000L, row.getTimestamp(7, 6).getMillisecond());
    };
    consumer.accept(collector.list.get(0));
}
Also used : DataType(org.apache.flink.table.types.DataType) STRING(org.apache.flink.table.api.DataTypes.STRING) Arrays(java.util.Arrays) FLOAT(org.apache.flink.table.api.DataTypes.FLOAT) URL(java.net.URL) ROW(org.apache.flink.table.api.DataTypes.ROW) RowType(org.apache.flink.table.types.logical.RowType) ArrayList(java.util.ArrayList) JsonFormatOptions(org.apache.flink.formats.json.JsonFormatOptions) Collector(org.apache.flink.util.Collector) ReadableMetadata(org.apache.flink.formats.json.ogg.OggJsonDecodingFormat.ReadableMetadata) ExpectedException(org.junit.rules.ExpectedException) Path(java.nio.file.Path) FIELD(org.apache.flink.table.api.DataTypes.FIELD) INT(org.apache.flink.table.api.DataTypes.INT) DataTypeUtils(org.apache.flink.table.types.utils.DataTypeUtils) RowData(org.apache.flink.table.data.RowData) Files(java.nio.file.Files) Assert.assertTrue(org.junit.Assert.assertTrue) DataTypes(org.apache.flink.table.api.DataTypes) Test(org.junit.Test) IOException(java.io.IOException) TimestampFormat(org.apache.flink.formats.common.TimestampFormat) Collectors(java.util.stream.Collectors) File(java.io.File) StandardCharsets(java.nio.charset.StandardCharsets) Consumer(java.util.function.Consumer) List(java.util.List) InternalTypeInfo(org.apache.flink.table.runtime.typeutils.InternalTypeInfo) Rule(org.junit.Rule) Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals)
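
SimpleCollector is referenced but not shown above. A minimal sketch consistent with its use in the test (an assumption, not the actual source): a Collector<RowData> that buffers every record into a list the assertions can inspect.

import java.util.ArrayList;
import java.util.List;

import org.apache.flink.table.data.RowData;
import org.apache.flink.util.Collector;

// In the real test this is a private static nested class, so the test
// method can read `list` directly.
final class SimpleCollector implements Collector<RowData> {

    final List<RowData> list = new ArrayList<>();

    @Override
    public void collect(RowData record) {
        list.add(record);
    }

    @Override
    public void close() {
        // nothing to release
    }
}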

Example 65 with Collector

Use of org.apache.flink.util.Collector in project flink by apache.

From the class MaxwellJsonSerDerTest, method testDeserializationWithMetadata.

@Test
public void testDeserializationWithMetadata() throws Exception {
    // we only read the first line to keep the test simple
    final String firstLine = readLines("maxwell-data.txt").get(0);
    final List<ReadableMetadata> requestedMetadata = Arrays.asList(ReadableMetadata.values());
    final DataType producedDataType = DataTypeUtils.appendRowFields(PHYSICAL_DATA_TYPE, requestedMetadata.stream().map(m -> DataTypes.FIELD(m.key, m.dataType)).collect(Collectors.toList()));
    final MaxwellJsonDeserializationSchema deserializationSchema = new MaxwellJsonDeserializationSchema(PHYSICAL_DATA_TYPE, requestedMetadata, InternalTypeInfo.of(producedDataType.getLogicalType()), false, TimestampFormat.ISO_8601);
    final SimpleCollector collector = new SimpleCollector();
    deserializationSchema.deserialize(firstLine.getBytes(StandardCharsets.UTF_8), collector);
    assertEquals(1, collector.list.size());
    Consumer<RowData> consumer = row -> {
        assertThat(row.getInt(0), equalTo(101));
        assertThat(row.getString(1).toString(), equalTo("scooter"));
        assertThat(row.getString(2).toString(), equalTo("Small 2-wheel scooter"));
        assertThat(row.getFloat(3), equalTo(3.14f));
        assertThat(row.getString(4).toString(), equalTo("test"));
        assertThat(row.getString(5).toString(), equalTo("product"));
        assertThat(row.getArray(6).getString(0).toString(), equalTo("id"));
        assertThat(row.getTimestamp(7, 3).getMillisecond(), equalTo(1596684883000L));
    };
    consumer.accept(collector.list.get(0));
}
Also used : DataType(org.apache.flink.table.types.DataType) STRING(org.apache.flink.table.api.DataTypes.STRING) Arrays(java.util.Arrays) FLOAT(org.apache.flink.table.api.DataTypes.FLOAT) URL(java.net.URL) ROW(org.apache.flink.table.api.DataTypes.ROW) CoreMatchers.equalTo(org.hamcrest.CoreMatchers.equalTo) RowType(org.apache.flink.table.types.logical.RowType) ArrayList(java.util.ArrayList) JsonFormatOptions(org.apache.flink.formats.json.JsonFormatOptions) Collector(org.apache.flink.util.Collector) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Path(java.nio.file.Path) FIELD(org.apache.flink.table.api.DataTypes.FIELD) INT(org.apache.flink.table.api.DataTypes.INT) DataTypeUtils(org.apache.flink.table.types.utils.DataTypeUtils) RowData(org.apache.flink.table.data.RowData) Files(java.nio.file.Files) ReadableMetadata(org.apache.flink.formats.json.maxwell.MaxwellJsonDecodingFormat.ReadableMetadata) DataTypes(org.apache.flink.table.api.DataTypes) Test(org.junit.Test) IOException(java.io.IOException) TimestampFormat(org.apache.flink.formats.common.TimestampFormat) Collectors(java.util.stream.Collectors) File(java.io.File) StandardCharsets(java.nio.charset.StandardCharsets) Consumer(java.util.function.Consumer) List(java.util.List) InternalTypeInfo(org.apache.flink.table.runtime.typeutils.InternalTypeInfo) Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals)
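
readLines(...) is likewise defined elsewhere in the test class. A plausible sketch, assuming the data file is resolved from the test classpath and read line by line; treat the exact lookup as an assumption rather than the verbatim helper.

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

// Lives inside the test class in the real source.
private static List<String> readLines(String resource) throws IOException {
    final URL url = MaxwellJsonSerDerTest.class.getClassLoader().getResource(resource);
    assert url != null;
    Path path = new File(url.getFile()).toPath();
    return Files.readAllLines(path);
}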

Aggregations

Types most frequently used together with Collector across the indexed examples:

Collector (org.apache.flink.util.Collector): 80
Test (org.junit.Test): 60
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 33
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 32
Configuration (org.apache.flink.configuration.Configuration): 27
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig): 19
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 18
ArrayList (java.util.ArrayList): 16
DataStream (org.apache.flink.streaming.api.datastream.DataStream): 16
TimeWindow (org.apache.flink.streaming.api.windowing.windows.TimeWindow): 16
HashMap (java.util.HashMap): 14
List (java.util.List): 14
RuntimeContext (org.apache.flink.api.common.functions.RuntimeContext): 14
Tuple3 (org.apache.flink.api.java.tuple.Tuple3): 12
IOException (java.io.IOException): 11
Arrays (java.util.Arrays): 11
Map (java.util.Map): 11
FlatMapFunction (org.apache.flink.api.common.functions.FlatMapFunction): 11
Assert.assertTrue (org.junit.Assert.assertTrue): 11
InternalWindowFunction (org.apache.flink.streaming.runtime.operators.windowing.functions.InternalWindowFunction): 10