Search in sources :

Example 31 with ExecException

use of org.apache.pig.backend.executionengine.ExecException in project sketches-pig by DataSketches.

the class VarOptCommonAlgebraicTest method rawTuplesToSketchTupleExec.

// exec(): first checks degenerate inputs, then feeds k + 1 weighted items (one more than
// the sketch size k) and validates the serialized sketch that comes back
@Test
public void rawTuplesToSketchTupleExec() {
    final int k = 5;
    final int wtIdx = 1;
    final VarOptCommonImpl.RawTuplesToSketchTuple udf =
            new VarOptCommonImpl.RawTuplesToSketchTuple(Integer.toString(k), Integer.toString(wtIdx));
    final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
    try {
        // items "a", "b", ... paired with weights 1.0, 2.0, ...
        for (int n = 0; n <= k; ++n) {
            final Tuple row = TupleFactory.getInstance().newTuple(2);
            row.set(0, Character.toString((char) ('a' + n)));
            row.set(1, 1.0 + n);
            inputBag.add(row);
        }
    } catch (final ExecException e) {
        fail("Unexpected ExecException creating input data");
    }
    try {
        // degenerate input first: null tuple, empty tuple, tuple holding a null bag
        assertNull(udf.exec(null));
        assertNull(udf.exec(TupleFactory.getInstance().newTuple(0)));
        final Tuple inputTuple = TupleFactory.getInstance().newTuple(1);
        inputTuple.set(0, null);
        assertNull(udf.exec(inputTuple));

        // now test real input
        inputTuple.set(0, inputBag);
        final Tuple result = udf.exec(inputTuple);
        assertEquals(result.size(), 1);
        final DataByteArray serialized = (DataByteArray) result.get(0);
        final VarOptItemsSketch<Tuple> sketch =
                VarOptItemsSketch.heapify(Memory.wrap(serialized.get()), serDe_);
        assertEquals(sketch.getN(), k + 1);
        assertEquals(sketch.getK(), k);

        // just validating the original weights are within the expected range
        for (VarOptItemsSamples<Tuple>.WeightedSample sample : sketch.getSketchSamples()) {
            final double originalWeight = (double) sample.getItem().get(wtIdx);
            assertTrue(originalWeight >= 1.0);
            assertTrue(originalWeight <= (k + 1.0));
        }
    } catch (final IOException e) {
        fail("Unexpected IOException calling exec()");
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) ExecException(org.apache.pig.backend.executionengine.ExecException) VarOptItemsSamples(com.yahoo.sketches.sampling.VarOptItemsSamples) IOException(java.io.IOException) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 32 with ExecException

use of org.apache.pig.backend.executionengine.ExecException in project sketches-pig by DataSketches.

the class VarOptCommonImpl method createDataBagFromSketch.

// Builds a DataBag holding one (weight, item) tuple per sample in the input sketch.
// An ExecException raised while filling a tuple is rethrown as a RuntimeException
// that keeps the original exception as its cause.
static DataBag createDataBagFromSketch(final VarOptItemsSketch<Tuple> sketch) {
    final DataBag bag = BAG_FACTORY.newDefaultBag();
    try {
        for (final VarOptItemsSamples<Tuple>.WeightedSample sample : sketch.getSketchSamples()) {
            // field 0 = sample weight, field 1 = the sampled item
            final Tuple pair = TUPLE_FACTORY.newTuple(2);
            pair.set(0, sample.getWeight());
            pair.set(1, sample.getItem());
            bag.add(pair);
        }
    } catch (final ExecException e) {
        throw new RuntimeException("Pig error: " + e.getMessage(), e);
    }
    return bag;
}
Also used : DataBag(org.apache.pig.data.DataBag) ExecException(org.apache.pig.backend.executionengine.ExecException) VarOptItemsSamples(com.yahoo.sketches.sampling.VarOptItemsSamples) Tuple(org.apache.pig.data.Tuple)

Example 33 with ExecException

use of org.apache.pig.backend.executionengine.ExecException in project shifu by ShifuML.

the class NNParquetWorker method load.

/**
 * Converts one input tuple into a float data pair (inputs, ideal, significance) and adds it to
 * the worker's data set.
 *
 * <p>Fields are read in order: target columns fill {@code ideal}, columns contained in
 * {@code subFeatureSet} fill {@code inputs}, and the weight column (if reached) sets the record
 * significance. A weight field that is {@code null} or an empty string falls back to {@code 1f}.
 * In dry mode an all-zero pair is added instead and the tuple is not read.
 *
 * @param currentKey key of the current record; not used by this method
 * @param currentValue adapter wrapping the tuple to parse
 * @param workerContext worker context; a {@link Boolean} attachment marks the record as
 *        validation data
 * @throws GuaguaRuntimeException if a tuple field cannot be read
 * @throws RuntimeException if the number of parsed input features differs from the expected
 *         input size
 */
@Override
public void load(GuaguaWritableAdapter<LongWritable> currentKey, GuaguaWritableAdapter<Tuple> currentValue, WorkerContext<NNParams, NNParams> workerContext) {
    // init field list for later read
    this.initFieldList();
    LOG.info("subFeatureSet size: {} ; subFeatureSet: {}", subFeatureSet.size(), subFeatureSet);
    super.count += 1;
    if ((super.count) % 5000 == 0) {
        LOG.info("Read {} records.", super.count);
    }
    float[] inputs = new float[super.featureInputsCnt];
    float[] ideal = new float[super.outputNodeCount];
    if (super.isDry) {
        // dry train, use empty data.
        addDataPairToDataSet(0, new BasicFloatMLDataPair(new BasicFloatMLData(inputs), new BasicFloatMLData(ideal)));
        return;
    }
    long hashcode = 0;
    float significance = 1f;
    int index = 0, inputsIndex = 0, outputIndex = 0;
    Tuple tuple = currentValue.getWritable();
    // back from foreach to for loop because of in earlier version, tuple cannot be iterable.
    for (int i = 0; i < tuple.size(); i++) {
        Object element = null;
        try {
            element = tuple.get(i);
        } catch (ExecException e) {
            throw new GuaguaRuntimeException(e);
        }
        float floatValue = 0f;
        if (element != null) {
            if (element instanceof Float) {
                floatValue = (Float) element;
            } else {
                // check here to avoid bad performance in failed NumberFormatUtils.getFloat(input, 0f)
                floatValue = element.toString().length() == 0 ? 0f : NumberFormatUtils.getFloat(element.toString(), 0f);
            }
        }
        // no idea about why NaN in input data, we should process it as missing value TODO , according to norm type
        floatValue = Float.isNaN(floatValue) ? 0f : floatValue;
        if (index == (super.inputNodeCount + super.outputNodeCount)) {
            // weight column reached
            if (StringUtils.isBlank(modelConfig.getWeightColumnName())) {
                significance = 1f;
                // break here if we reach weight column which is last column
                break;
            }
            significance = parseSignificance(element);
            // if invalid weight, set it to 1f and warning in log
            if (Float.compare(significance, 0f) < 0) {
                LOG.warn("The {} record in current worker weight {} is less than 0f, it is invalid, set it to 1.", count, significance);
                significance = 1f;
            }
            // break here if we reach weight column which is last column
            break;
        } else {
            int columnIndex = requiredFieldList.getFields().get(index).getIndex();
            if (columnIndex >= super.columnConfigList.size()) {
                // a field index beyond the configured columns is read as the weight
                significance = parseSignificance(element);
                break;
            } else {
                ColumnConfig columnConfig = super.columnConfigList.get(columnIndex);
                if (columnConfig != null && columnConfig.isTarget()) {
                    if (modelConfig.isRegression()) {
                        ideal[outputIndex++] = floatValue;
                    } else {
                        if (modelConfig.getTrain().isOneVsAll()) {
                            // if one vs all, set correlated idea value according to trainerId which means in
                            // trainer with id 0, target 0 is treated with 1, other are 0. Such target value are set
                            // to index of tags like [0, 1, 2, 3] compared with ["a", "b", "c", "d"]
                            ideal[outputIndex++] = Float.compare(floatValue, trainerId) == 0 ? 1f : 0f;
                        } else {
                            if (modelConfig.getTags().size() == 2) {
                                // if only 2 classes, output node is 1 node. if target = 0 means 0 is the index for
                                // positive prediction, set positive to 1 and negative to 0
                                int ideaIndex = (int) floatValue;
                                ideal[0] = ideaIndex == 0 ? 1f : 0f;
                            } else {
                                // for multiple classification
                                int ideaIndex = (int) floatValue;
                                ideal[ideaIndex] = 1f;
                            }
                        }
                    }
                } else {
                    if (subFeatureSet.contains(columnIndex)) {
                        inputs[inputsIndex++] = floatValue;
                        hashcode = hashcode * 31 + Double.valueOf(floatValue).hashCode();
                    }
                }
            }
        }
        index += 1;
    }
    // fail fast when the parsed feature count does not match the expected input size; the
    // delimiter is included in the message to help diagnose the mismatch
    if (inputsIndex != inputs.length) {
        String delimiter = workerContext.getProps().getProperty(Constants.SHIFU_OUTPUT_DATA_DELIMITER, Constants.DEFAULT_DELIMITER);
        throw new RuntimeException("Input length is inconsistent with parsing size. Input original size: " + inputs.length + ", parsing size:" + inputsIndex + ", delimiter:" + delimiter + ".");
    }
    // sample negative only logic here
    if (modelConfig.getTrain().getSampleNegOnly()) {
        if (this.modelConfig.isFixInitialInput()) {
            // if fixInitialInput, sample hashcode in 1-sampleRate range out if negative records
            int startHashCode = (100 / this.modelConfig.getBaggingNum()) * this.trainerId;
            // here BaggingSampleRate means how many data will be used in training and validation, if it is 0.8, we
            // should take 1-0.8 to check endHashCode
            int endHashCode = startHashCode + Double.valueOf((1d - this.modelConfig.getBaggingSampleRate()) * 100).intValue();
            // (regression or one-vs-all classification) and negative record and hashcode in range
            if ((modelConfig.isRegression() || (modelConfig.isClassification() && modelConfig.getTrain().isOneVsAll()))
                    && (int) (ideal[0] + 0.01d) == 0
                    && isInRange(hashcode, startHashCode, endHashCode)) {
                return;
            }
        } else {
            // (regression or one-vs-all classification) and negative record and random draw >= sample rate
            if ((modelConfig.isRegression() || (modelConfig.isClassification() && modelConfig.getTrain().isOneVsAll()))
                    && (int) (ideal[0] + 0.01d) == 0
                    && Double.compare(super.sampelNegOnlyRandom.nextDouble(), this.modelConfig.getBaggingSampleRate()) >= 0) {
                return;
            }
        }
    }
    FloatMLDataPair pair = new BasicFloatMLDataPair(new BasicFloatMLData(inputs), new BasicFloatMLData(ideal));
    // up sampling logic
    if (modelConfig.isRegression() && isUpSampleEnabled() && Double.compare(ideal[0], 1d) == 0) {
        // Double.compare(ideal[0], 1d) == 0 means positive tags; sample + 1 to avoid sample count to 0
        pair.setSignificance(significance * (super.upSampleRng.sample() + 1));
    } else {
        pair.setSignificance(significance);
    }
    boolean isValidation = false;
    if (workerContext.getAttachment() != null && workerContext.getAttachment() instanceof Boolean) {
        isValidation = (Boolean) workerContext.getAttachment();
    }
    boolean isInTraining = addDataPairToDataSet(hashcode, pair, isValidation);
    // do bagging sampling only for training data
    if (isInTraining) {
        float subsampleWeights = sampleWeights(pair.getIdealArray()[0]);
        if (isPositive(pair.getIdealArray()[0])) {
            this.positiveSelectedTrainCount += subsampleWeights * 1L;
        } else {
            this.negativeSelectedTrainCount += subsampleWeights * 1L;
        }
        // set weights to significance, if 0, significance will be 0, that is bagging sampling
        pair.setSignificance(pair.getSignificance() * subsampleWeights);
    }
    // for validation data, according bagging sampling logic, we may need to sampling validation data set, while
    // validation data set are only used to compute validation error, not to do real sampling is ok.
}

/**
 * Reads a record weight from a raw tuple field. A {@code null} field or an empty string yields
 * the default weight {@code 1f}; any other non-Float value is parsed from its string form with
 * {@code 1f} as the fallback passed to the parser.
 */
private float parseSignificance(Object element) {
    if (element == null) {
        // previously this case hit element.toString() and threw a NullPointerException
        return 1f;
    }
    if (element instanceof Float) {
        return (Float) element;
    }
    // check length here to avoid bad performance in failed NumberFormatUtils.getFloat(input, 1f)
    String text = element.toString();
    return text.length() == 0 ? 1f : NumberFormatUtils.getFloat(text, 1f);
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) ExecException(org.apache.pig.backend.executionengine.ExecException) BasicFloatMLDataPair(ml.shifu.shifu.core.dtrain.dataset.BasicFloatMLDataPair) FloatMLDataPair(ml.shifu.shifu.core.dtrain.dataset.FloatMLDataPair) BasicFloatMLDataPair(ml.shifu.shifu.core.dtrain.dataset.BasicFloatMLDataPair) BasicFloatMLData(ml.shifu.shifu.core.dtrain.dataset.BasicFloatMLData) GuaguaRuntimeException(ml.shifu.guagua.GuaguaRuntimeException) GuaguaRuntimeException(ml.shifu.guagua.GuaguaRuntimeException) Tuple(org.apache.pig.data.Tuple)

Example 34 with ExecException

use of org.apache.pig.backend.executionengine.ExecException in project shifu by ShifuML.

the class NumericalVarStats method statsNumericalColumnInfo.

/**
 * Computes per-bin positive/negative counts (plain and weighted) and streaming statistics for
 * a numerical column, then stores the results on {@code columnConfig}.
 *
 * <p>Each tuple is read as: field 1 = raw value, field 2 = tag, field 3 = weight. Tuples with
 * fewer than 4 fields are skipped. Blank values are counted as missing and unparsable values
 * as invalid; both are accumulated in the extra last bin.
 *
 * @param databag tuples of the column to analyse
 * @param columnConfig column configuration supplying the bin boundaries and receiving the stats
 * @throws ExecException if a tuple field cannot be read
 */
private void statsNumericalColumnInfo(DataBag databag, ColumnConfig columnConfig) throws ExecException {
    // one slot per bin boundary plus a trailing slot for missingOrInvalid values
    final int lastBinIndex = columnConfig.getBinBoundary().size();
    final int slotCount = lastBinIndex + 1;
    Integer[] binCountPos = new Integer[slotCount];
    Integer[] binCountNeg = new Integer[slotCount];
    Double[] binWeightCountPos = new Double[slotCount];
    Double[] binWeightCountNeg = new Double[slotCount];
    initializeZeroArr(binCountPos);
    initializeZeroArr(binCountNeg);
    initializeZeroArr(binWeightCountPos);
    initializeZeroArr(binWeightCountNeg);

    for (Tuple row : databag) {
        if (row.size() < 4) {
            continue;
        }
        final Object rawValue = row.get(1);
        final String tag = CommonUtils.trimTag((String) row.get(2));
        final Double weight = (Double) row.get(3);

        double parsed = 0.0;
        String trimmed = null;
        boolean missingOrInvalid;
        if (rawValue == null || StringUtils.isBlank(rawValue.toString())) {
            // TODO check missing value list in ModelConfig??
            missingValueCnt++;
            missingOrInvalid = true;
        } else {
            trimmed = StringUtils.trim(rawValue.toString());
            try {
                parsed = Double.parseDouble(trimmed);
                missingOrInvalid = false;
            } catch (Exception e) {
                invalidValueCnt++;
                missingOrInvalid = true;
            }
        }

        // pick the bin first, then update the counters for the matching tag group
        final int binNum;
        if (missingOrInvalid) {
            binNum = lastBinIndex;
        } else {
            streamStatsCalculator.addData(parsed);
            binNum = BinUtils.getBinNum(columnConfig, trimmed);
            if (binNum == -1) {
                throw new RuntimeException("binNum should not be -1 to this step.");
            }
        }
        if (modelConfig.getPosTags().contains(tag)) {
            increaseInstCnt(binCountPos, binNum);
            increaseInstCnt(binWeightCountPos, binNum, weight);
        } else if (modelConfig.getNegTags().contains(tag)) {
            increaseInstCnt(binCountNeg, binNum);
            increaseInstCnt(binWeightCountNeg, binNum, weight);
        }
    }

    columnConfig.setBinCountPos(Arrays.asList(binCountPos));
    columnConfig.setBinCountNeg(Arrays.asList(binCountNeg));
    columnConfig.setBinWeightedPos(Arrays.asList(binWeightCountPos));
    columnConfig.setBinWeightedNeg(Arrays.asList(binWeightCountNeg));

    columnConfig.setMax(streamStatsCalculator.getMax());
    columnConfig.setMean(streamStatsCalculator.getMean());
    columnConfig.setMin(streamStatsCalculator.getMin());
    columnConfig.setMedian(streamStatsCalculator.getMedian());
    columnConfig.setStdDev(streamStatsCalculator.getStdDev());

    // Currently, invalid value will be regarded as missing
    columnConfig.setMissingCnt(missingValueCnt + invalidValueCnt);
    columnConfig.setTotalCount(databag.size());
    columnConfig.setMissingPercentage(((double) columnConfig.getMissingCount()) / columnConfig.getTotalCount());

    columnConfig.getColumnStats().setSkewness(streamStatsCalculator.getSkewness());
    columnConfig.getColumnStats().setKurtosis(streamStatsCalculator.getKurtosis());
    calculateBinPosRateAndAvgScore();
}
Also used : ExecException(org.apache.pig.backend.executionengine.ExecException) Tuple(org.apache.pig.data.Tuple)

Example 35 with ExecException

use of org.apache.pig.backend.executionengine.ExecException in project elephant-bird by twitter.

the class PigToProtobuf method tupleToMessage.

/**
 * Copies the fields of a Pig tuple into {@code builder}, position by position, and builds the
 * message.
 *
 * <p>Only the first {@code min(fieldDescriptors.size(), tuple.size())} positions are visited.
 * Null tuple fields are skipped, and a field that cannot be read from the tuple is logged and
 * skipped. A {@code null} tuple yields whatever {@code builder.build()} returns for the
 * untouched builder.
 *
 * @param builder builder to populate
 * @param fieldDescriptors should be same as builder.getDescriptorForType.getFields().
 *        Avoids overhead of getFields() which creates an array each time.
 * @param tuple source tuple, may be {@code null}
 * @return the message built from {@code builder}
 * @throws RuntimeException if a non-null tuple field cannot be converted to or set on its
 *         protobuf field; the original exception is kept as the cause
 */
public static Message tupleToMessage(Builder builder, List<FieldDescriptor> fieldDescriptors, Tuple tuple) {
    if (tuple == null) {
        return builder.build();
    }
    for (int i = 0; i < fieldDescriptors.size() && i < tuple.size(); i++) {
        FieldDescriptor fieldDescriptor = fieldDescriptors.get(i);
        Object tupleField;
        try {
            tupleField = tuple.get(i);
        } catch (ExecException e) {
            // the field value is unavailable here, so report the index and keep the exception
            LOG.warn("Could not read tuple field at index " + i + " for field with descriptor " + fieldDescriptor, e);
            continue;
        }
        if (tupleField == null) {
            continue;
        }
        try {
            if (fieldDescriptor.isRepeated()) {
                // Repeated fields are set with Lists containing objects of the fields' Java type.
                builder.setField(fieldDescriptor, dataBagToRepeatedField(builder, fieldDescriptor, (DataBag) tupleField));
            } else if (fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE) {
                Builder nestedMessageBuilder = builder.newBuilderForField(fieldDescriptor);
                builder.setField(fieldDescriptor, tupleToMessage(nestedMessageBuilder, (Tuple) tupleField));
            } else {
                builder.setField(fieldDescriptor, tupleFieldToSingleField(fieldDescriptor, tupleField));
            }
        } catch (Exception e) {
            // truncate long values so the error message stays readable
            String value = String.valueOf(tupleField);
            final int max_length = 100;
            if (max_length < value.length()) {
                value = value.substring(0, max_length - 3) + "...";
            }
            // tupleField is known to be non-null here
            String type = tupleField.getClass().getName();
            throw new RuntimeException(String.format("Failed to set field '%s' using tuple value '%s' of type '%s' at index %d", fieldDescriptor.getName(), value, type, i), e);
        }
    }
    return builder.build();
}
Also used : DataBag(org.apache.pig.data.DataBag) ExecException(org.apache.pig.backend.executionengine.ExecException) Builder(com.google.protobuf.Message.Builder) ByteString(com.google.protobuf.ByteString) Tuple(org.apache.pig.data.Tuple) DescriptorValidationException(com.google.protobuf.Descriptors.DescriptorValidationException) ExecException(org.apache.pig.backend.executionengine.ExecException) FieldDescriptor(com.google.protobuf.Descriptors.FieldDescriptor)

Aggregations

ExecException (org.apache.pig.backend.executionengine.ExecException)57 Tuple (org.apache.pig.data.Tuple)32 DataBag (org.apache.pig.data.DataBag)17 OGCGeometry (com.esri.core.geometry.ogc.OGCGeometry)13 DataByteArray (org.apache.pig.data.DataByteArray)11 IOException (java.io.IOException)9 Geometry (org.locationtech.jts.geom.Geometry)9 Test (org.testng.annotations.Test)6 Coordinate (org.locationtech.jts.geom.Coordinate)4 Field (com.twitter.elephantbird.thrift.TStructDescriptor.Field)3 FieldDescriptor (com.google.protobuf.Descriptors.FieldDescriptor)2 Builder (com.google.protobuf.Message.Builder)2 VarOptItemsSamples (com.yahoo.sketches.sampling.VarOptItemsSamples)2 ArrayList (java.util.ArrayList)2 List (java.util.List)2 VarOptItemsSamples (org.apache.datasketches.sampling.VarOptItemsSamples)2 Line (com.esri.core.geometry.Line)1 MultiPath (com.esri.core.geometry.MultiPath)1 Point (com.esri.core.geometry.Point)1 Polygon (com.esri.core.geometry.Polygon)1