use of org.apache.pig.backend.executionengine.ExecException in project sketches-pig by DataSketches.
the class VarOptCommonAlgebraicTest method rawTuplesToSketchTupleExec.
// exec: sketches generally in sampling mode (k + 1 weighted items are fed to a sketch of size k)
@Test
public void rawTuplesToSketchTupleExec() {
    final int k = 5;
    final int wtIdx = 1;
    final VarOptCommonImpl.RawTuplesToSketchTuple udf =
            new VarOptCommonImpl.RawTuplesToSketchTuple(Integer.toString(k), Integer.toString(wtIdx));

    // Build k + 1 (id, weight) rows: ids 'a', 'b', ... paired with weights 1.0, 2.0, ...
    final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
    try {
        for (int i = 0; i <= k; ++i) {
            final Tuple row = TupleFactory.getInstance().newTuple(2);
            row.set(0, Character.toString((char) ('a' + i)));
            row.set(1, 1.0 + i);
            inputBag.add(row);
        }
    } catch (final ExecException e) {
        fail("Unexpected ExecException creating input data");
    }

    try {
        // Degenerate inputs: a null tuple, an empty tuple and a tuple holding a null field
        // are all expected to produce a null result.
        assertNull(udf.exec(null));
        assertNull(udf.exec(TupleFactory.getInstance().newTuple(0)));

        final Tuple inputTuple = TupleFactory.getInstance().newTuple(1);
        inputTuple.set(0, null);
        assertNull(udf.exec(inputTuple));

        // Real input: the single output field is a serialized sketch.
        inputTuple.set(0, inputBag);
        final Tuple result = udf.exec(inputTuple);
        assertEquals(result.size(), 1);

        final DataByteArray sketchBytes = (DataByteArray) result.get(0);
        final VarOptItemsSketch<Tuple> sketch =
                VarOptItemsSketch.heapify(Memory.wrap(sketchBytes.get()), serDe_);
        assertEquals(sketch.getN(), k + 1);
        assertEquals(sketch.getK(), k);

        // Only checks that every retained item still carries an original weight in [1, k + 1].
        for (VarOptItemsSamples<Tuple>.WeightedSample sample : sketch.getSketchSamples()) {
            final Tuple item = sample.getItem();
            assertTrue((double) item.get(wtIdx) >= 1.0);
            assertTrue((double) item.get(wtIdx) <= (k + 1.0));
        }
    } catch (final IOException e) {
        fail("Unexpected IOException calling exec()");
    }
}
use of org.apache.pig.backend.executionengine.ExecException in project sketches-pig by DataSketches.
the class VarOptCommonImpl method createDataBagFromSketch.
// Converts every weighted sample held by the input sketch into a (weight, item) tuple
// and returns them collected in a new default DataBag.
static DataBag createDataBagFromSketch(final VarOptItemsSketch<Tuple> sketch) {
    final DataBag bag = BAG_FACTORY.newDefaultBag();
    final VarOptItemsSamples<Tuple> sketchSamples = sketch.getSketchSamples();

    for (final VarOptItemsSamples<Tuple>.WeightedSample sample : sketchSamples) {
        final Tuple pair = TUPLE_FACTORY.newTuple(2);
        try {
            // field 0 holds the sample weight, field 1 the sampled item
            pair.set(0, sample.getWeight());
            pair.set(1, sample.getItem());
        } catch (final ExecException e) {
            // surface Pig failures as unchecked, keeping the original cause
            throw new RuntimeException("Pig error: " + e.getMessage(), e);
        }
        bag.add(pair);
    }

    return bag;
}
use of org.apache.pig.backend.executionengine.ExecException in project shifu by ShifuML.
the class NNParquetWorker method load.
/**
 * Converts one input tuple into an input/ideal data pair and adds it to the worker's data set.
 *
 * <p>Each call initializes the field list, bumps the record counter (logging every 5000 records)
 * and then walks the tuple column by column: target columns fill {@code ideal}, columns contained
 * in {@code subFeatureSet} fill {@code inputs}, and a trailing weight column (if any) becomes the
 * pair's significance. In dry mode an all-zero pair is added and nothing is parsed.
 *
 * <p>A weight value that is {@code null} or empty falls back to the default significance of 1
 * instead of failing with a {@link NullPointerException}.
 *
 * @param currentKey    not used by this method
 * @param currentValue  adapter wrapping the tuple to parse
 * @param workerContext worker context; its properties supply the delimiter used in the error
 *                      message and its attachment (when a {@code Boolean}) marks validation data
 * @throws GuaguaRuntimeException if a tuple element cannot be read
 * @throws RuntimeException       if the number of parsed inputs differs from the expected count
 */
@Override
public void load(GuaguaWritableAdapter<LongWritable> currentKey, GuaguaWritableAdapter<Tuple> currentValue, WorkerContext<NNParams, NNParams> workerContext) {
    // init field list for later read
    this.initFieldList();
    LOG.info("subFeatureSet size: {} ; subFeatureSet: {}", subFeatureSet.size(), subFeatureSet);
    super.count += 1;
    if ((super.count) % 5000 == 0) {
        LOG.info("Read {} records.", super.count);
    }
    float[] inputs = new float[super.featureInputsCnt];
    float[] ideal = new float[super.outputNodeCount];
    if (super.isDry) {
        // dry train, use empty data.
        addDataPairToDataSet(0, new BasicFloatMLDataPair(new BasicFloatMLData(inputs), new BasicFloatMLData(ideal)));
        return;
    }
    long hashcode = 0;
    float significance = 1f;
    int index = 0, inputsIndex = 0, outputIndex = 0;
    Tuple tuple = currentValue.getWritable();
    // index-based loop kept on purpose: in earlier versions the tuple was not iterable.
    for (int i = 0; i < tuple.size(); i++) {
        Object element = null;
        try {
            element = tuple.get(i);
        } catch (ExecException e) {
            throw new GuaguaRuntimeException(e);
        }
        float floatValue = 0f;
        if (element != null) {
            if (element instanceof Float) {
                floatValue = (Float) element;
            } else {
                // check emptiness first to avoid the slow failure path of NumberFormatUtils.getFloat(input, 0f)
                String text = element.toString();
                floatValue = text.length() == 0 ? 0f : NumberFormatUtils.getFloat(text, 0f);
            }
        }
        // NaN in input data is processed as a missing value (0f). TODO: handle according to norm type
        floatValue = Float.isNaN(floatValue) ? 0f : floatValue;
        if (index == (super.inputNodeCount + super.outputNodeCount)) {
            // reached the weight column, which is the last column: stop after handling it
            if (StringUtils.isBlank(modelConfig.getWeightColumnName())) {
                significance = 1f;
                break;
            }
            significance = parseSignificance(element);
            // if invalid weight, set it to 1f and warning in log
            if (Float.compare(significance, 0f) < 0) {
                LOG.warn("The {} record in current worker weight {} is less than 0f, it is invalid, set it to 1.", count, significance);
                significance = 1f;
            }
            break;
        } else {
            int columnIndex = requiredFieldList.getFields().get(index).getIndex();
            if (columnIndex >= super.columnConfigList.size()) {
                // column index beyond the configured columns: the value is read as the weight.
                // NOTE(review): unlike the branch above, negative weights are not reset to 1 here - confirm intent.
                significance = parseSignificance(element);
                break;
            } else {
                ColumnConfig columnConfig = super.columnConfigList.get(columnIndex);
                if (columnConfig != null && columnConfig.isTarget()) {
                    if (modelConfig.isRegression()) {
                        ideal[outputIndex++] = floatValue;
                    } else {
                        if (modelConfig.getTrain().isOneVsAll()) {
                            // if one vs all, set correlated idea value according to trainerId which means in
                            // trainer with id 0, target 0 is treated with 1, other are 0. Such target value are set
                            // to index of tags like [0, 1, 2, 3] compared with ["a", "b", "c", "d"]
                            ideal[outputIndex++] = Float.compare(floatValue, trainerId) == 0 ? 1f : 0f;
                        } else {
                            if (modelConfig.getTags().size() == 2) {
                                // if only 2 classes, output node is 1 node. if target = 0 means 0 is the index for
                                // positive prediction, set positive to 1 and negative to 0
                                int ideaIndex = (int) floatValue;
                                ideal[0] = ideaIndex == 0 ? 1f : 0f;
                            } else {
                                // for multiple classification
                                int ideaIndex = (int) floatValue;
                                ideal[ideaIndex] = 1f;
                            }
                        }
                    }
                } else {
                    if (subFeatureSet.contains(columnIndex)) {
                        inputs[inputsIndex++] = floatValue;
                        hashcode = hashcode * 31 + Double.valueOf(floatValue).hashCode();
                    }
                }
            }
        }
        index += 1;
    }
    // fail fast when fewer/more inputs were parsed than expected; this helps to quickly find such issues.
    if (inputsIndex != inputs.length) {
        String delimiter = workerContext.getProps().getProperty(Constants.SHIFU_OUTPUT_DATA_DELIMITER, Constants.DEFAULT_DELIMITER);
        throw new RuntimeException("Input length is inconsistent with parsing size. Input original size: " + inputs.length + ", parsing size:" + inputsIndex + ", delimiter:" + delimiter + ".");
    }
    // sample negative only logic here
    if (modelConfig.getTrain().getSampleNegOnly()) {
        // negative record in regression or one-vs-all classification (short-circuit order preserved)
        boolean negativeRecord = (modelConfig.isRegression()
                || (modelConfig.isClassification() && modelConfig.getTrain().isOneVsAll()))
                && (int) (ideal[0] + 0.01d) == 0;
        if (this.modelConfig.isFixInitialInput()) {
            // if fixInitialInput, sample hashcode in 1-sampleRate range out if negative records
            int startHashCode = (100 / this.modelConfig.getBaggingNum()) * this.trainerId;
            // here BaggingSampleRate means how many data will be used in training and validation, if it is 0.8, we
            // should take 1-0.8 to check endHashCode
            int endHashCode = startHashCode + Double.valueOf((1d - this.modelConfig.getBaggingSampleRate()) * 100).intValue();
            if (negativeRecord && isInRange(hashcode, startHashCode, endHashCode)) {
                return;
            }
        } else {
            // the random draw only happens for negative records, as before
            if (negativeRecord && Double.compare(super.sampelNegOnlyRandom.nextDouble(), this.modelConfig.getBaggingSampleRate()) >= 0) {
                return;
            }
        }
    }
    FloatMLDataPair pair = new BasicFloatMLDataPair(new BasicFloatMLData(inputs), new BasicFloatMLData(ideal));
    // up sampling logic
    if (modelConfig.isRegression() && isUpSampleEnabled() && Double.compare(ideal[0], 1d) == 0) {
        // Double.compare(ideal[0], 1d) == 0 means positive tags; sample + 1 to avoid sample count to 0
        pair.setSignificance(significance * (super.upSampleRng.sample() + 1));
    } else {
        pair.setSignificance(significance);
    }
    boolean isValidation = false;
    if (workerContext.getAttachment() != null && workerContext.getAttachment() instanceof Boolean) {
        isValidation = (Boolean) workerContext.getAttachment();
    }
    boolean isInTraining = addDataPairToDataSet(hashcode, pair, isValidation);
    // do bagging sampling only for training data
    if (isInTraining) {
        float subsampleWeights = sampleWeights(pair.getIdealArray()[0]);
        if (isPositive(pair.getIdealArray()[0])) {
            this.positiveSelectedTrainCount += subsampleWeights * 1L;
        } else {
            this.negativeSelectedTrainCount += subsampleWeights * 1L;
        }
        // set weights to significance, if 0, significance will be 0, that is bagging sampling
        pair.setSignificance(pair.getSignificance() * subsampleWeights);
    }
    // for validation data, according bagging sampling logic, we may need to sampling validation data set, while
    // validation data set are only used to compute validation error, not to do real sampling is ok.
}

/**
 * Parses a weight column value into a significance.
 *
 * @param element raw column value, may be {@code null}
 * @return the value itself when it is a {@code Float}; 1 when it is {@code null} or its string
 *         form is empty; otherwise the result of {@code NumberFormatUtils.getFloat(text, 1f)}
 */
private static float parseSignificance(Object element) {
    if (element instanceof Float) {
        return (Float) element;
    }
    if (element == null) {
        // previously this path dereferenced null whenever assertions were disabled
        return 1f;
    }
    // check emptiness first to avoid the slow failure path of NumberFormatUtils.getFloat(input, 1f)
    String text = element.toString();
    return text.length() == 0 ? 1f : NumberFormatUtils.getFloat(text, 1f);
}
use of org.apache.pig.backend.executionengine.ExecException in project shifu by ShifuML.
the class NumericalVarStats method statsNumericalColumnInfo.
/**
 * Walks every tuple of the bag, accumulates per-bin positive/negative counts (plain and
 * weighted) plus streaming statistics for one numerical column, and stores the results in
 * the column config. Tuples with fewer than four fields are skipped.
 *
 * @param databag      bag of tuples; field 1 is read as the column value, field 2 as the tag
 *                     and field 3 as the weight
 * @param columnConfig column config that supplies the bin boundaries and receives the stats
 * @throws ExecException if a tuple field cannot be read
 */
private void statsNumericalColumnInfo(DataBag databag, ColumnConfig columnConfig) throws ExecException {
    // One slot per bin boundary plus a trailing slot for missing or invalid values.
    final int lastBinIndex = columnConfig.getBinBoundary().size();
    final int slotCount = lastBinIndex + 1;
    Integer[] binCountPos = new Integer[slotCount];
    Integer[] binCountNeg = new Integer[slotCount];
    Double[] binWeightCountPos = new Double[slotCount];
    Double[] binWeightCountNeg = new Double[slotCount];
    initializeZeroArr(binCountPos);
    initializeZeroArr(binCountNeg);
    initializeZeroArr(binWeightCountPos);
    initializeZeroArr(binWeightCountNeg);

    for (Tuple row : databag) {
        if (row.size() < 4) {
            continue;
        }
        Object rawValue = row.get(1);
        String tag = CommonUtils.trimTag((String) row.get(2));
        Double weight = (Double) row.get(3);

        boolean missing = false;
        boolean invalid = false;
        double parsed = 0.0;
        String trimmed = null;
        if (rawValue == null || StringUtils.isBlank(rawValue.toString())) {
            // TODO check missing value list in ModelConfig??
            missingValueCnt++;
            missing = true;
        } else {
            trimmed = StringUtils.trim(rawValue.toString());
            try {
                parsed = Double.parseDouble(trimmed);
            } catch (Exception e) {
                invalidValueCnt++;
                invalid = true;
            }
        }

        // Resolve the target bin: the trailing slot for unusable values, otherwise the bin
        // looked up for the trimmed text (after feeding the value to the stream statistics).
        int binIndex;
        if (missing || invalid) {
            binIndex = lastBinIndex;
        } else {
            streamStatsCalculator.addData(parsed);
            binIndex = BinUtils.getBinNum(columnConfig, trimmed);
            if (binIndex == -1) {
                throw new RuntimeException("binNum should not be -1 to this step.");
            }
        }

        // Tags that are neither positive nor negative are not counted in any bin.
        if (modelConfig.getPosTags().contains(tag)) {
            increaseInstCnt(binCountPos, binIndex);
            increaseInstCnt(binWeightCountPos, binIndex, weight);
        } else if (modelConfig.getNegTags().contains(tag)) {
            increaseInstCnt(binCountNeg, binIndex);
            increaseInstCnt(binWeightCountNeg, binIndex, weight);
        }
    }

    // Publish the bin counters.
    columnConfig.setBinCountPos(Arrays.asList(binCountPos));
    columnConfig.setBinCountNeg(Arrays.asList(binCountNeg));
    columnConfig.setBinWeightedPos(Arrays.asList(binWeightCountPos));
    columnConfig.setBinWeightedNeg(Arrays.asList(binWeightCountNeg));

    // Publish the streaming statistics.
    columnConfig.setMax(streamStatsCalculator.getMax());
    columnConfig.setMean(streamStatsCalculator.getMean());
    columnConfig.setMin(streamStatsCalculator.getMin());
    columnConfig.setMedian(streamStatsCalculator.getMedian());
    columnConfig.setStdDev(streamStatsCalculator.getStdDev());

    // Currently, invalid values are counted together with missing ones.
    columnConfig.setMissingCnt(missingValueCnt + invalidValueCnt);
    columnConfig.setTotalCount(databag.size());
    columnConfig.setMissingPercentage(((double) columnConfig.getMissingCount()) / columnConfig.getTotalCount());
    columnConfig.getColumnStats().setSkewness(streamStatsCalculator.getSkewness());
    columnConfig.getColumnStats().setKurtosis(streamStatsCalculator.getKurtosis());
    calculateBinPosRateAndAvgScore();
}
use of org.apache.pig.backend.executionengine.ExecException in project elephant-bird by twitter.
the class PigToProtobuf method tupleToMessage.
/**
 * Fills {@code builder} from {@code tuple} by position and builds the resulting message.
 *
 * <p>Field {@code i} of the tuple is written to descriptor {@code i}; iteration stops at the
 * shorter of the two. Null tuple fields are left unset. A field that cannot be read from the
 * tuple is logged (with its index and the cause) and skipped.
 *
 * @param builder builder to populate
 * @param fieldDescriptors should be same as builder.getDescriptorForType.getFields().
 * Avoids overhead of getFields() which creates an array each time.
 * @param tuple source tuple; when {@code null} the builder is built as-is
 * @return the message built from {@code builder}
 * @throws RuntimeException if a non-null tuple field cannot be converted or set; the message
 *         names the field, a truncated value, its type and the index, and the cause is kept
 */
public static Message tupleToMessage(Builder builder, List<FieldDescriptor> fieldDescriptors, Tuple tuple) {
    if (tuple == null) {
        return builder.build();
    }
    for (int i = 0; i < fieldDescriptors.size() && i < tuple.size(); i++) {
        FieldDescriptor fieldDescriptor = fieldDescriptors.get(i);
        Object tupleField;
        try {
            tupleField = tuple.get(i);
        } catch (ExecException e) {
            // The value was never read, so report the index and the cause instead of a null value.
            LOG.warn("Could not read tuple field at index " + i + " for field with descriptor " + fieldDescriptor, e);
            continue;
        }
        if (tupleField == null) {
            continue;
        }
        try {
            if (fieldDescriptor.isRepeated()) {
                // Repeated fields are set with Lists containing objects of the fields' Java type.
                builder.setField(fieldDescriptor, dataBagToRepeatedField(builder, fieldDescriptor, (DataBag) tupleField));
            } else if (fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE) {
                Builder nestedMessageBuilder = builder.newBuilderForField(fieldDescriptor);
                builder.setField(fieldDescriptor, tupleToMessage(nestedMessageBuilder, (Tuple) tupleField));
            } else {
                builder.setField(fieldDescriptor, tupleFieldToSingleField(fieldDescriptor, tupleField));
            }
        } catch (Exception e) {
            // Truncate long values so the error message stays readable.
            String value = String.valueOf(tupleField);
            final int maxLength = 100;
            if (maxLength < value.length()) {
                value = value.substring(0, maxLength - 3) + "...";
            }
            // tupleField is known to be non-null here
            String type = tupleField.getClass().getName();
            throw new RuntimeException(String.format("Failed to set field '%s' using tuple value '%s' of type '%s' at index %d", fieldDescriptor.getName(), value, type, i), e);
        }
    }
    return builder.build();
}
Aggregations