Search in sources :

Example 1 with WordTupleParserFactory

Use of org.apache.hyracks.examples.text.WordTupleParserFactory in the Apache AsterixDB project.

From the class WordCountMain, method createJob.

/**
 * Assembles the word-count job: a file scan parsed by {@link WordTupleParserFactory}, a grouping
 * stage chosen by {@code algo}, and a file writer chosen by {@code format}.
 *
 * @param inSplits   file splits handed to the scanner; also used for the scanner's partition constraint
 * @param outSplits  file splits handed to the writer; also used for the partition constraints of the
 *                   sort, group and writer operators
 * @param algo       {@code "hash"} (case-insensitive) selects an external group operator;
 *                   {@code "memsort"} selects an in-memory sort followed by a preclustered group;
 *                   any other value selects an external sort followed by a preclustered group
 * @param htSize     passed through to the external group operator (hash branch only)
 * @param frameLimit passed through to the external group operator and to the external sort operator
 * @param format     {@code "text"} (case-insensitive) selects the plain file writer; any other value
 *                   selects the frame file writer
 * @param frameSize  passed to the {@link JobSpecification} constructor
 * @return the job specification with the writer registered as its root
 */
private static JobSpecification createJob(FileSplit[] inSplits, FileSplit[] outSplits, String algo, int htSize, int frameLimit, String format, int frameSize) {
    final JobSpecification job = new JobSpecification(frameSize);

    // Scan stage: the record descriptor declares a single UTF-8 string field.
    final IFileSplitProvider inputProvider = new ConstantFileSplitProvider(inSplits);
    final RecordDescriptor scanDesc = new RecordDescriptor(new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });
    final FileScanOperatorDescriptor scanner = new FileScanOperatorDescriptor(job, inputProvider, new WordTupleParserFactory(), scanDesc);
    createPartitionConstraint(job, scanner, inSplits);

    // Grouping output: a UTF-8 string field followed by an integer field.
    final RecordDescriptor resultDesc = new RecordDescriptor(new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE });

    final int[] groupFields = { 0 };
    final IOperatorDescriptor grouper;
    if (!"hash".equalsIgnoreCase(algo)) {
        // Sort-based path: hash-partition on the key, sort, then group the sorted stream.
        final IBinaryComparatorFactory[] sortComparators = new IBinaryComparatorFactory[] { PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY) };
        final IOperatorDescriptor sortOp;
        if ("memsort".equalsIgnoreCase(algo)) {
            sortOp = new InMemorySortOperatorDescriptor(job, groupFields, new UTF8StringNormalizedKeyComputerFactory(), sortComparators, scanDesc);
        } else {
            sortOp = new ExternalSortOperatorDescriptor(job, frameLimit, groupFields, new UTF8StringNormalizedKeyComputerFactory(), sortComparators, scanDesc);
        }
        createPartitionConstraint(job, sortOp, outSplits);

        final IBinaryHashFunctionFactory[] partitionHashes = new IBinaryHashFunctionFactory[] { PointableBinaryHashFunctionFactory.of(UTF8StringPointable.FACTORY) };
        final IConnectorDescriptor scanToSort = new MToNPartitioningConnectorDescriptor(job, new FieldHashPartitionComputerFactory(groupFields, partitionHashes));
        job.connect(scanToSort, scanner, 0, sortOp, 0);

        final IBinaryComparatorFactory[] groupComparators = new IBinaryComparatorFactory[] { PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY) };
        final MultiFieldsAggregatorFactory counter = new MultiFieldsAggregatorFactory(new IFieldAggregateDescriptorFactory[] { new CountFieldAggregatorFactory(true) });
        grouper = new PreclusteredGroupOperatorDescriptor(job, groupFields, groupComparators, counter, resultDesc);
        createPartitionConstraint(job, grouper, outSplits);

        final OneToOneConnectorDescriptor sortToGroup = new OneToOneConnectorDescriptor(job);
        job.connect(sortToGroup, sortOp, 0, grouper, 0);
    } else {
        // Hash-based path: a single external group operator fed by a hash-partitioning connector.
        final IBinaryComparatorFactory[] groupComparators = new IBinaryComparatorFactory[] { PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY) };
        final UTF8StringNormalizedKeyComputerFactory keyNormalizer = new UTF8StringNormalizedKeyComputerFactory();
        // NOTE(review): these aggregators reference field indexes 1, 3 and 5 (and 1, 2, 3 below), while
        // the scan descriptor declares one field and the result descriptor two -- confirm this is intended.
        final MultiFieldsAggregatorFactory firstAggregator = new MultiFieldsAggregatorFactory(new IFieldAggregateDescriptorFactory[] { new IntSumFieldAggregatorFactory(1, false), new IntSumFieldAggregatorFactory(3, false), new FloatSumFieldAggregatorFactory(5, false) });
        final MultiFieldsAggregatorFactory secondAggregator = new MultiFieldsAggregatorFactory(new IFieldAggregateDescriptorFactory[] { new IntSumFieldAggregatorFactory(1, false), new IntSumFieldAggregatorFactory(2, false), new FloatSumFieldAggregatorFactory(3, false) });
        final HashSpillableTableFactory tableFactory = new HashSpillableTableFactory(new IBinaryHashFunctionFamily[] { UTF8StringBinaryHashFunctionFamily.INSTANCE });
        // fileSize is not declared in this method; presumably a field of the enclosing class.
        grouper = new ExternalGroupOperatorDescriptor(job, htSize, fileSize, groupFields, frameLimit, groupComparators, keyNormalizer, firstAggregator, secondAggregator, resultDesc, resultDesc, tableFactory);
        createPartitionConstraint(job, grouper, outSplits);

        final IBinaryHashFunctionFactory[] partitionHashes = new IBinaryHashFunctionFactory[] { PointableBinaryHashFunctionFactory.of(UTF8StringPointable.FACTORY) };
        final IConnectorDescriptor scanToGroup = new MToNPartitioningConnectorDescriptor(job, new FieldHashPartitionComputerFactory(groupFields, partitionHashes));
        job.connect(scanToGroup, scanner, 0, grouper, 0);
    }

    // Output stage: one writer fed one-to-one by the grouping operator.
    final IFileSplitProvider outputProvider = new ConstantFileSplitProvider(outSplits);
    final IOperatorDescriptor sink;
    if ("text".equalsIgnoreCase(format)) {
        sink = new PlainFileWriterOperatorDescriptor(job, outputProvider, ",");
    } else {
        sink = new FrameFileWriterOperatorDescriptor(job, outputProvider);
    }
    createPartitionConstraint(job, sink, outSplits);

    final IConnectorDescriptor groupToSink = new OneToOneConnectorDescriptor(job);
    job.connect(groupToSink, grouper, 0, sink, 0);
    job.addRoot(sink);
    return job;
}
Also used : WordTupleParserFactory(org.apache.hyracks.examples.text.WordTupleParserFactory) IFileSplitProvider(org.apache.hyracks.dataflow.std.file.IFileSplitProvider) RecordDescriptor(org.apache.hyracks.api.dataflow.value.RecordDescriptor) HashSpillableTableFactory(org.apache.hyracks.dataflow.std.group.HashSpillableTableFactory) CountFieldAggregatorFactory(org.apache.hyracks.dataflow.std.group.aggregators.CountFieldAggregatorFactory) OneToOneConnectorDescriptor(org.apache.hyracks.dataflow.std.connectors.OneToOneConnectorDescriptor) UTF8StringSerializerDeserializer(org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer) IBinaryHashFunctionFactory(org.apache.hyracks.api.dataflow.value.IBinaryHashFunctionFactory) FileScanOperatorDescriptor(org.apache.hyracks.dataflow.std.file.FileScanOperatorDescriptor) JobSpecification(org.apache.hyracks.api.job.JobSpecification) UTF8StringNormalizedKeyComputerFactory(org.apache.hyracks.dataflow.common.data.normalizers.UTF8StringNormalizedKeyComputerFactory) IConnectorDescriptor(org.apache.hyracks.api.dataflow.IConnectorDescriptor) MultiFieldsAggregatorFactory(org.apache.hyracks.dataflow.std.group.aggregators.MultiFieldsAggregatorFactory) ExternalGroupOperatorDescriptor(org.apache.hyracks.dataflow.std.group.external.ExternalGroupOperatorDescriptor) InMemorySortOperatorDescriptor(org.apache.hyracks.dataflow.std.sort.InMemorySortOperatorDescriptor) ConstantFileSplitProvider(org.apache.hyracks.dataflow.std.file.ConstantFileSplitProvider) IBinaryComparatorFactory(org.apache.hyracks.api.dataflow.value.IBinaryComparatorFactory) MToNPartitioningConnectorDescriptor(org.apache.hyracks.dataflow.std.connectors.MToNPartitioningConnectorDescriptor) FieldHashPartitionComputerFactory(org.apache.hyracks.dataflow.common.data.partition.FieldHashPartitionComputerFactory) IOperatorDescriptor(org.apache.hyracks.api.dataflow.IOperatorDescriptor) 
PlainFileWriterOperatorDescriptor(org.apache.hyracks.dataflow.std.file.PlainFileWriterOperatorDescriptor) FloatSumFieldAggregatorFactory(org.apache.hyracks.dataflow.std.group.aggregators.FloatSumFieldAggregatorFactory) ExternalSortOperatorDescriptor(org.apache.hyracks.dataflow.std.sort.ExternalSortOperatorDescriptor) PreclusteredGroupOperatorDescriptor(org.apache.hyracks.dataflow.std.group.preclustered.PreclusteredGroupOperatorDescriptor) IntSumFieldAggregatorFactory(org.apache.hyracks.dataflow.std.group.aggregators.IntSumFieldAggregatorFactory) FrameFileWriterOperatorDescriptor(org.apache.hyracks.dataflow.std.file.FrameFileWriterOperatorDescriptor)

Aggregations

IConnectorDescriptor (org.apache.hyracks.api.dataflow.IConnectorDescriptor)1 IOperatorDescriptor (org.apache.hyracks.api.dataflow.IOperatorDescriptor)1 IBinaryComparatorFactory (org.apache.hyracks.api.dataflow.value.IBinaryComparatorFactory)1 IBinaryHashFunctionFactory (org.apache.hyracks.api.dataflow.value.IBinaryHashFunctionFactory)1 RecordDescriptor (org.apache.hyracks.api.dataflow.value.RecordDescriptor)1 JobSpecification (org.apache.hyracks.api.job.JobSpecification)1 UTF8StringSerializerDeserializer (org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer)1 UTF8StringNormalizedKeyComputerFactory (org.apache.hyracks.dataflow.common.data.normalizers.UTF8StringNormalizedKeyComputerFactory)1 FieldHashPartitionComputerFactory (org.apache.hyracks.dataflow.common.data.partition.FieldHashPartitionComputerFactory)1 MToNPartitioningConnectorDescriptor (org.apache.hyracks.dataflow.std.connectors.MToNPartitioningConnectorDescriptor)1 OneToOneConnectorDescriptor (org.apache.hyracks.dataflow.std.connectors.OneToOneConnectorDescriptor)1 ConstantFileSplitProvider (org.apache.hyracks.dataflow.std.file.ConstantFileSplitProvider)1 FileScanOperatorDescriptor (org.apache.hyracks.dataflow.std.file.FileScanOperatorDescriptor)1 FrameFileWriterOperatorDescriptor (org.apache.hyracks.dataflow.std.file.FrameFileWriterOperatorDescriptor)1 IFileSplitProvider (org.apache.hyracks.dataflow.std.file.IFileSplitProvider)1 PlainFileWriterOperatorDescriptor (org.apache.hyracks.dataflow.std.file.PlainFileWriterOperatorDescriptor)1 HashSpillableTableFactory (org.apache.hyracks.dataflow.std.group.HashSpillableTableFactory)1 CountFieldAggregatorFactory (org.apache.hyracks.dataflow.std.group.aggregators.CountFieldAggregatorFactory)1 FloatSumFieldAggregatorFactory (org.apache.hyracks.dataflow.std.group.aggregators.FloatSumFieldAggregatorFactory)1 IntSumFieldAggregatorFactory (org.apache.hyracks.dataflow.std.group.aggregators.IntSumFieldAggregatorFactory)1