use of org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer in project asterixdb by apache.
the class PigletMetadataProvider method getScannerRuntime.
@Override
public Pair<IOperatorDescriptor, AlgebricksPartitionConstraint> getScannerRuntime(IDataSource<String> dataSource, List<LogicalVariable> scanVariables, List<LogicalVariable> projectVariables, boolean projectPushed, List<LogicalVariable> minFilterVars, List<LogicalVariable> maxFilterVars, IOperatorSchema opSchema, IVariableTypeEnvironment typeEnv, JobGenContext context, JobSpecification jobSpec, Object implConfig) throws AlgebricksException {
PigletFileDataSource ds = (PigletFileDataSource) dataSource;
FileSplit[] fileSplits = ds.getFileSplits();
String[] locations = new String[fileSplits.length];
for (int i = 0; i < fileSplits.length; ++i) {
locations[i] = fileSplits[i].getNodeName();
}
IFileSplitProvider fsp = new ConstantFileSplitProvider(fileSplits);
Object[] colTypes = ds.getSchemaTypes();
IValueParserFactory[] vpfs = new IValueParserFactory[colTypes.length];
ISerializerDeserializer[] serDesers = new ISerializerDeserializer[colTypes.length];
for (int i = 0; i < colTypes.length; ++i) {
Type colType = (Type) colTypes[i];
IValueParserFactory vpf;
ISerializerDeserializer serDeser;
switch(colType.getTag()) {
case INTEGER:
vpf = IntegerParserFactory.INSTANCE;
serDeser = IntegerSerializerDeserializer.INSTANCE;
break;
case CHAR_ARRAY:
vpf = UTF8StringParserFactory.INSTANCE;
serDeser = new UTF8StringSerializerDeserializer();
break;
case FLOAT:
vpf = FloatParserFactory.INSTANCE;
serDeser = FloatSerializerDeserializer.INSTANCE;
break;
default:
throw new UnsupportedOperationException();
}
vpfs[i] = vpf;
serDesers[i] = serDeser;
}
ITupleParserFactory tpf = new DelimitedDataTupleParserFactory(vpfs, ',');
RecordDescriptor rDesc = new RecordDescriptor(serDesers);
IOperatorDescriptor scanner = new FileScanOperatorDescriptor(jobSpec, fsp, tpf, rDesc);
AlgebricksAbsolutePartitionConstraint constraint = new AlgebricksAbsolutePartitionConstraint(locations);
return new Pair<>(scanner, constraint);
}
use of org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer in project asterixdb by apache.
the class SecondaryIndexSearchExample method createJob.
private static JobSpecification createJob(Options options) throws HyracksDataException {
JobSpecification spec = new JobSpecification(options.frameSize);
String[] splitNCs = options.ncs.split(",");
IStorageManager storageManager = BTreeHelperStorageManager.INSTANCE;
// schema of tuples coming out of secondary index
RecordDescriptor secondaryRecDesc = new RecordDescriptor(new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE });
int secondaryFieldCount = 2;
ITypeTraits[] secondaryTypeTraits = new ITypeTraits[secondaryFieldCount];
secondaryTypeTraits[0] = UTF8StringPointable.TYPE_TRAITS;
secondaryTypeTraits[1] = IntegerPointable.TYPE_TRAITS;
// comparators for sort fields and BTree fields
IBinaryComparatorFactory[] secondaryComparatorFactories = new IBinaryComparatorFactory[2];
secondaryComparatorFactories[0] = PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY);
secondaryComparatorFactories[1] = PointableBinaryComparatorFactory.of(IntegerPointable.FACTORY);
// comparators for primary index
IBinaryComparatorFactory[] primaryComparatorFactories = new IBinaryComparatorFactory[1];
primaryComparatorFactories[1] = PointableBinaryComparatorFactory.of(IntegerPointable.FACTORY);
// schema of tuples coming out of primary index
RecordDescriptor primaryRecDesc = new RecordDescriptor(new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer() });
int primaryFieldCount = 4;
ITypeTraits[] primaryTypeTraits = new ITypeTraits[primaryFieldCount];
primaryTypeTraits[0] = IntegerPointable.TYPE_TRAITS;
primaryTypeTraits[1] = UTF8StringPointable.TYPE_TRAITS;
primaryTypeTraits[2] = IntegerPointable.TYPE_TRAITS;
primaryTypeTraits[3] = UTF8StringPointable.TYPE_TRAITS;
// comparators for btree, note that we only need a comparator for the
// non-unique key
// i.e. we will have a range condition on the first field only (implying
// [-infinity, +infinity] for the second field)
IBinaryComparatorFactory[] searchComparatorFactories = new IBinaryComparatorFactory[1];
searchComparatorFactories[0] = PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY);
// build tuple containing low and high search keys
// low
ArrayTupleBuilder tb = new ArrayTupleBuilder(searchComparatorFactories.length * 2);
// and
// high
// key
DataOutput dos = tb.getDataOutput();
tb.reset();
// low
new UTF8StringSerializerDeserializer().serialize("0", dos);
// key
tb.addFieldEndOffset();
// high
new UTF8StringSerializerDeserializer().serialize("f", dos);
// key
tb.addFieldEndOffset();
ISerializerDeserializer[] keyRecDescSers = { new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() };
RecordDescriptor keyRecDesc = new RecordDescriptor(keyRecDescSers);
ConstantTupleSourceOperatorDescriptor keyProviderOp = new ConstantTupleSourceOperatorDescriptor(spec, keyRecDesc, tb.getFieldEndOffsets(), tb.getByteArray(), tb.getSize());
JobHelper.createPartitionConstraint(spec, keyProviderOp, splitNCs);
// low key is in field 0 of tuples
int[] secondaryLowKeyFields = { 0 };
// going into secondary index
// search op
// high key is in field 1 of
int[] secondaryHighKeyFields = { 1 };
// tuples going into secondary
// index search op
IFileSplitProvider secondarySplitProvider = JobHelper.createFileSplitProvider(splitNCs, options.secondaryBTreeName);
IIndexDataflowHelperFactory secondaryHelperFactory = new IndexDataflowHelperFactory(storageManager, secondarySplitProvider);
BTreeSearchOperatorDescriptor secondarySearchOp = new BTreeSearchOperatorDescriptor(spec, secondaryRecDesc, secondaryLowKeyFields, secondaryHighKeyFields, true, true, secondaryHelperFactory, false, false, null, NoOpOperationCallbackFactory.INSTANCE, null, null, false);
JobHelper.createPartitionConstraint(spec, secondarySearchOp, splitNCs);
// secondary index will output tuples with [UTF8String, Integer]
// the Integer field refers to the key in the primary index of the
// source data records
// low key is in field 0 of tuples
int[] primaryLowKeyFields = { 1 };
// going into primary index search op
// high key is in field 1 of tuples
int[] primaryHighKeyFields = { 1 };
// going into primary index search
// op
IFileSplitProvider primarySplitProvider = JobHelper.createFileSplitProvider(splitNCs, options.primaryBTreeName);
IIndexDataflowHelperFactory primaryHelperFactory = new IndexDataflowHelperFactory(storageManager, primarySplitProvider);
BTreeSearchOperatorDescriptor primarySearchOp = new BTreeSearchOperatorDescriptor(spec, primaryRecDesc, primaryLowKeyFields, primaryHighKeyFields, true, true, primaryHelperFactory, false, false, null, NoOpOperationCallbackFactory.INSTANCE, null, null, false);
JobHelper.createPartitionConstraint(spec, primarySearchOp, splitNCs);
// have each node print the results of its respective B-Tree
PrinterOperatorDescriptor printer = new PrinterOperatorDescriptor(spec);
JobHelper.createPartitionConstraint(spec, printer, splitNCs);
spec.connect(new OneToOneConnectorDescriptor(spec), keyProviderOp, 0, secondarySearchOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), secondarySearchOp, 0, primarySearchOp, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), primarySearchOp, 0, printer, 0);
spec.addRoot(printer);
return spec;
}
use of org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer in project asterixdb by apache.
the class DataGenOperatorDescriptor method createPushRuntime.
@Override
public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx, IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions) throws HyracksDataException {
final FrameTupleAppender appender = new FrameTupleAppender(new VSizeFrame(ctx));
final RecordDescriptor recDesc = outRecDescs[0];
final ArrayTupleBuilder tb = new ArrayTupleBuilder(recDesc.getFields().length);
final Random rnd = new Random(randomSeed);
final int maxUniqueAttempts = 20;
return new AbstractUnaryOutputSourceOperatorNodePushable() {
// for quick & dirty exclusion of duplicates
// WARNING: could contain numRecord entries and use a lot of memory
HashSet<String> stringHs = new HashSet<String>();
HashSet<Integer> intHs = new HashSet<Integer>();
@Override
public void initialize() throws HyracksDataException {
try {
writer.open();
for (int i = 0; i < numRecords; i++) {
tb.reset();
for (int j = 0; j < recDesc.getFieldCount(); j++) {
genField(tb, j);
}
if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
appender.write(writer, true);
if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
throw new HyracksDataException("Record size (" + tb.getSize() + ") larger than frame size (" + appender.getBuffer().capacity() + ")");
}
}
}
appender.write(writer, true);
} catch (Throwable th) {
writer.fail();
throw new HyracksDataException(th);
} finally {
writer.close();
}
}
private void genField(ArrayTupleBuilder tb, int fieldIndex) throws HyracksDataException {
DataOutput dos = tb.getDataOutput();
if (recDesc.getFields()[fieldIndex] instanceof IntegerSerializerDeserializer) {
int val = -1;
if (fieldIndex == uniqueField) {
int attempt = 0;
while (attempt < maxUniqueAttempts) {
int tmp = Math.abs(rnd.nextInt()) % (intMaxVal - intMinVal) + intMinVal;
if (intHs.contains(tmp))
attempt++;
else {
val = tmp;
intHs.add(val);
break;
}
}
if (attempt == maxUniqueAttempts)
throw new HyracksDataException("MaxUnique attempts reached in datagen");
} else {
val = Math.abs(rnd.nextInt()) % (intMaxVal - intMinVal) + intMinVal;
}
recDesc.getFields()[fieldIndex].serialize(val, dos);
tb.addFieldEndOffset();
} else if (recDesc.getFields()[fieldIndex] instanceof UTF8StringSerializerDeserializer) {
String val = null;
if (fieldIndex == uniqueField) {
int attempt = 0;
while (attempt < maxUniqueAttempts) {
String tmp = randomString(maxStrLen, rnd);
if (stringHs.contains(tmp))
attempt++;
else {
val = tmp;
stringHs.add(val);
break;
}
}
if (attempt == maxUniqueAttempts)
throw new HyracksDataException("MaxUnique attempts reached in datagen");
} else {
val = randomString(maxStrLen, rnd);
}
recDesc.getFields()[fieldIndex].serialize(val, dos);
tb.addFieldEndOffset();
} else {
throw new HyracksDataException("Type unsupported in data generator. Only integers and strings allowed");
}
}
private String randomString(int length, Random random) {
String s = Long.toHexString(Double.doubleToLongBits(random.nextDouble()));
StringBuilder strBuilder = new StringBuilder();
for (int i = 0; i < s.length() && i < length; i++) {
strBuilder.append(s.charAt(Math.abs(random.nextInt()) % s.length()));
}
return strBuilder.toString();
}
};
}
use of org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer in project asterixdb by apache.
the class InsertPipelineExample method createJob.
private static JobSpecification createJob(Options options) {
JobSpecification spec = new JobSpecification(options.frameSize);
String[] splitNCs = options.ncs.split(",");
// schema of tuples to be generated: 4 fields with int, string, string,
// string
// we will use field 2 as primary key to fill a clustered index
RecordDescriptor recDesc = new RecordDescriptor(new ISerializerDeserializer[] { // this field will not go into B-Tree
new UTF8StringSerializerDeserializer(), // we will use this as payload
new UTF8StringSerializerDeserializer(), // we will use this field as key
IntegerSerializerDeserializer.INSTANCE, // we will use this as payload
IntegerSerializerDeserializer.INSTANCE, // we will use this as payload
new UTF8StringSerializerDeserializer() });
// generate numRecords records with field 2 being unique, integer values
// in [0, 100000], and strings with max length of 10 characters, and
// random seed 100
DataGenOperatorDescriptor dataGen = new DataGenOperatorDescriptor(spec, recDesc, options.numTuples, 2, 0, 100000, 10, 100);
// run data generator on first nodecontroller given
PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, dataGen, splitNCs[0]);
IStorageManager storageManager = BTreeHelperStorageManager.INSTANCE;
// prepare insertion into primary index
// tuples to be put into B-Tree shall have 4 fields
int primaryFieldCount = 4;
ITypeTraits[] primaryTypeTraits = new ITypeTraits[primaryFieldCount];
primaryTypeTraits[0] = IntegerPointable.TYPE_TRAITS;
primaryTypeTraits[1] = UTF8StringPointable.TYPE_TRAITS;
primaryTypeTraits[2] = IntegerPointable.TYPE_TRAITS;
primaryTypeTraits[3] = UTF8StringPointable.TYPE_TRAITS;
// comparator factories for primary index
IBinaryComparatorFactory[] primaryComparatorFactories = new IBinaryComparatorFactory[1];
primaryComparatorFactories[0] = PointableBinaryComparatorFactory.of(IntegerPointable.FACTORY);
// the B-Tree expects its keyfields to be at the front of its input
// tuple
// map field 2 of input
int[] primaryFieldPermutation = { 2, 1, 3, 4 };
// tuple to field 0 of
// B-Tree tuple, etc.
IFileSplitProvider primarySplitProvider = JobHelper.createFileSplitProvider(splitNCs, options.primaryBTreeName);
IIndexDataflowHelperFactory primaryHelperFactory = new IndexDataflowHelperFactory(storageManager, primarySplitProvider);
// create operator descriptor
TreeIndexInsertUpdateDeleteOperatorDescriptor primaryInsert = new TreeIndexInsertUpdateDeleteOperatorDescriptor(spec, recDesc, primaryFieldPermutation, IndexOperation.INSERT, primaryHelperFactory, null, NoOpOperationCallbackFactory.INSTANCE);
JobHelper.createPartitionConstraint(spec, primaryInsert, splitNCs);
// prepare insertion into secondary index
// tuples to be put into B-Tree shall have 2 fields
int secondaryFieldCount = 2;
ITypeTraits[] secondaryTypeTraits = new ITypeTraits[secondaryFieldCount];
secondaryTypeTraits[0] = UTF8StringPointable.TYPE_TRAITS;
secondaryTypeTraits[1] = IntegerPointable.TYPE_TRAITS;
// comparator factories for secondary index
IBinaryComparatorFactory[] secondaryComparatorFactories = new IBinaryComparatorFactory[2];
secondaryComparatorFactories[0] = PointableBinaryComparatorFactory.of(UTF8StringPointable.FACTORY);
secondaryComparatorFactories[1] = PointableBinaryComparatorFactory.of(IntegerPointable.FACTORY);
// the B-Tree expects its keyfields to be at the front of its input
// tuple
int[] secondaryFieldPermutation = { 1, 2 };
IFileSplitProvider secondarySplitProvider = JobHelper.createFileSplitProvider(splitNCs, options.secondaryBTreeName);
IIndexDataflowHelperFactory secondaryHelperFactory = new IndexDataflowHelperFactory(storageManager, secondarySplitProvider);
// create operator descriptor
TreeIndexInsertUpdateDeleteOperatorDescriptor secondaryInsert = new TreeIndexInsertUpdateDeleteOperatorDescriptor(spec, recDesc, secondaryFieldPermutation, IndexOperation.INSERT, secondaryHelperFactory, null, NoOpOperationCallbackFactory.INSTANCE);
JobHelper.createPartitionConstraint(spec, secondaryInsert, splitNCs);
// end the insert pipeline at this sink operator
NullSinkOperatorDescriptor nullSink = new NullSinkOperatorDescriptor(spec);
JobHelper.createPartitionConstraint(spec, nullSink, splitNCs);
// distribute the records from the datagen via hashing to the bulk load
// ops
IBinaryHashFunctionFactory[] hashFactories = new IBinaryHashFunctionFactory[1];
hashFactories[0] = PointableBinaryHashFunctionFactory.of(UTF8StringPointable.FACTORY);
IConnectorDescriptor hashConn = new MToNPartitioningConnectorDescriptor(spec, new FieldHashPartitionComputerFactory(new int[] { 0 }, hashFactories));
// connect the ops
spec.connect(hashConn, dataGen, 0, primaryInsert, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), primaryInsert, 0, secondaryInsert, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), secondaryInsert, 0, nullSink, 0);
spec.addRoot(nullSink);
return spec;
}
use of org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer in project asterixdb by apache.
the class PrimaryIndexBulkLoadExample method createJob.
private static JobSpecification createJob(Options options) {
JobSpecification spec = new JobSpecification(options.frameSize);
String[] splitNCs = options.ncs.split(",");
// schema of tuples to be generated: 5 fields with string, string, int,
// int, string
// we will use field-index 2 as primary key to fill a clustered index
RecordDescriptor recDesc = new RecordDescriptor(new ISerializerDeserializer[] { // this field will not go into B-Tree
new UTF8StringSerializerDeserializer(), // we will use this as payload
new UTF8StringSerializerDeserializer(), // we will use this field as key
IntegerSerializerDeserializer.INSTANCE, // we will use this as payload
IntegerSerializerDeserializer.INSTANCE, // we will use this as payload
new UTF8StringSerializerDeserializer() });
// generate numRecords records with field 2 being unique, integer values
// in [0, 100000], and strings with max length of 10 characters, and
// random seed 50
DataGenOperatorDescriptor dataGen = new DataGenOperatorDescriptor(spec, recDesc, options.numTuples, 2, 0, 100000, 10, 50);
// run data generator on first nodecontroller given
PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, dataGen, splitNCs[0]);
// sort the tuples as preparation for bulk load
// fields to sort on
int[] sortFields = { 2 };
// comparators for sort fields
IBinaryComparatorFactory[] comparatorFactories = new IBinaryComparatorFactory[1];
comparatorFactories[0] = PointableBinaryComparatorFactory.of(IntegerPointable.FACTORY);
ExternalSortOperatorDescriptor sorter = new ExternalSortOperatorDescriptor(spec, options.sbSize, sortFields, comparatorFactories, recDesc);
JobHelper.createPartitionConstraint(spec, sorter, splitNCs);
// tuples to be put into B-Tree shall have 4 fields
int fieldCount = 4;
ITypeTraits[] typeTraits = new ITypeTraits[fieldCount];
typeTraits[0] = IntegerPointable.TYPE_TRAITS;
typeTraits[1] = UTF8StringPointable.TYPE_TRAITS;
typeTraits[2] = IntegerPointable.TYPE_TRAITS;
typeTraits[3] = UTF8StringPointable.TYPE_TRAITS;
// create providers for B-Tree
IStorageManager storageManager = BTreeHelperStorageManager.INSTANCE;
// the B-Tree expects its keyfields to be at the front of its input
// tuple
// map field 2 of input tuple
int[] fieldPermutation = { 2, 1, 3, 4 };
// to field 0 of B-Tree tuple,
// etc.
IFileSplitProvider btreeSplitProvider = JobHelper.createFileSplitProvider(splitNCs, options.btreeName);
IIndexDataflowHelperFactory dataflowHelperFactory = new IndexDataflowHelperFactory(storageManager, btreeSplitProvider);
TreeIndexBulkLoadOperatorDescriptor btreeBulkLoad = new TreeIndexBulkLoadOperatorDescriptor(spec, recDesc, fieldPermutation, 0.7f, false, 1000L, true, dataflowHelperFactory);
JobHelper.createPartitionConstraint(spec, btreeBulkLoad, splitNCs);
// distribute the records from the datagen via hashing to the bulk load
// ops
IBinaryHashFunctionFactory[] hashFactories = new IBinaryHashFunctionFactory[1];
hashFactories[0] = PointableBinaryHashFunctionFactory.of(UTF8StringPointable.FACTORY);
IConnectorDescriptor hashConn = new MToNPartitioningConnectorDescriptor(spec, new FieldHashPartitionComputerFactory(new int[] { 0 }, hashFactories));
NullSinkOperatorDescriptor nsOpDesc = new NullSinkOperatorDescriptor(spec);
JobHelper.createPartitionConstraint(spec, nsOpDesc, splitNCs);
spec.connect(hashConn, dataGen, 0, sorter, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), sorter, 0, btreeBulkLoad, 0);
spec.connect(new OneToOneConnectorDescriptor(spec), btreeBulkLoad, 0, nsOpDesc, 0);
spec.addRoot(nsOpDesc);
return spec;
}
Aggregations