use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class NormalizeUDF method exec.
/**
 * Normalizes (or, in clean mode, only cleans) one input record and returns it as an output tuple.
 * <p>
 * The output holds the target value and the per-column values (either appended in column order or, in
 * compact mode, collected by normalized column name and emitted in {@code outputCompactColumns} order),
 * followed by the record weight as the last field.
 * <p>
 * The record is skipped ({@code null} is returned) when:
 * <ul>
 * <li>the input is {@code null} or empty;</li>
 * <li>the tag is {@code null}, or (for a non-linear target) is not contained in {@code tagSet};</li>
 * <li>the record is filtered out by sampling (non-linear target only);</li>
 * <li>the number of input fields differs from the number of column configs (non-expression mode);</li>
 * <li>the target value cannot be mapped to a class index, or is not numerical for a linear target.</li>
 * </ul>
 *
 * @param input
 *            the raw record
 * @return the normalized tuple, or {@code null} if the record is skipped
 * @throws IOException
 *             declared by the UDF contract
 * @throws ShifuException
 *             if more than {@code MAX_MISMATCH_CNT} records with a mismatched field count have been seen
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    Object tag = input.get(tagColumnNum);
    if (tag == null) {
        log.warn("The tag is NULL, just skip it!!");
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    final String rawTag = CommonUtils.trimTag(tag.toString());
    // make sure all records with an invalid tag are filtered out
    if (!isLinearTarget && !super.tagSet.contains(rawTag)) {
        if (isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
            PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1);
        }
        return null;
    }
    // if(!isLinearTarget && !this.isForClean) {
    if (!isLinearTarget) {
        // do data sampling. Unselected data or data with invalid tag will be filtered out.
        boolean isNotSampled = DataSampler.isNotSampled(modelConfig.isRegression(), super.tagSet, super.posTagSet, super.negTagSet, modelConfig.getNormalizeSampleRate(), modelConfig.isNormalizeSampleNegOnly(), rawTag);
        if (isNotSampled) {
            return null;
        }
    }
    // append tuple with tag, normalized value.
    Tuple tuple = TupleFactory.getInstance().newTuple();
    final NormType normType = modelConfig.getNormalizeType();
    // only allocated in compact mode; every use below is guarded by isCompactNorm
    Map<String, Object> compactVarMap = null;
    if (this.isCompactNorm) {
        compactVarMap = new HashMap<String, Object>();
    }
    if (!this.isForExpressions) {
        if (input.size() != this.columnConfigList.size()) {
            log.error("the input size - " + input.size() + ", while column size - " + columnConfigList.size());
            // count each malformed record exactly once, so that MAX_MISMATCH_CNT is the real tolerance
            this.mismatchCnt++;
            // this lets Shifu skip some malformed data before giving up
            if (this.mismatchCnt > MAX_MISMATCH_CNT) {
                throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
            }
            return null;
        }
        for (int i = 0; i < input.size(); i++) {
            ColumnConfig config = columnConfigList.get(i);
            String val = (input.get(i) == null) ? "" : input.get(i).toString();
            // load variables for weight calculating.
            if (weightExpr != null) {
                weightContext.set(new NSColumn(config.getColumnName()).getSimpleName(), val);
            }
            // check tag type.
            if (tagColumnNum == i) {
                if (modelConfig.isRegression()) {
                    // binary target: 1 for a positive tag, 0 for a negative tag
                    int type = 0;
                    if (super.posTagSet.contains(rawTag)) {
                        type = 1;
                    } else if (super.negTagSet.contains(rawTag)) {
                        type = 0;
                    } else {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), type);
                    } else {
                        tuple.append(type);
                    }
                } else if (this.isLinearTarget) {
                    // linear target: the tag itself is the numerical target value
                    double tagValue = 0.0;
                    try {
                        tagValue = Double.parseDouble(rawTag);
                    } catch (Exception e) {
                        log.error("Tag - " + rawTag + " is invalid(not numerical). Skip record.");
                        // skip this line
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), tagValue);
                    } else {
                        tuple.append(tagValue);
                    }
                } else {
                    // multi-class target: the target value is the index of the tag group containing the tag
                    int index = -1;
                    for (int j = 0; j < tags.size(); j++) {
                        Set<String> tagSet = tags.get(j);
                        if (tagSet.contains(rawTag)) {
                            index = j;
                            break;
                        }
                    }
                    if (index == -1) {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), index);
                    } else {
                        tuple.append(index);
                    }
                }
                continue;
            }
            if (this.isForClean) {
                // for RF/GBT model, only clean data, not real do norm data
                if (config.isCategorical()) {
                    Map<String, Integer> map = this.categoricalIndexMap.get(config.getColumnNum());
                    // map should not be null, no need check if map is null, if val not in binCategory, set it to ""
                    tuple.append(((map.get(val) == null || map.get(val) == -1)) ? "" : val);
                } else {
                    Double normVal = 0d;
                    try {
                        normVal = Double.parseDouble(val);
                    } catch (Exception e) {
                        log.debug("Not decimal format " + val + ", using default!");
                        normVal = Normalizer.defaultMissingValue(config);
                    }
                    appendOutputValue(tuple, normVal, true);
                }
            } else {
                if (this.isCompactNorm) {
                    // only output features and target, weight in compact norm mode
                    if (!config.isMeta() && config.isFinalSelect()) {
                        // for multiple classification, binPosRate means rate of such category over all counts,
                        // reuse binPosRate for normalize
                        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                        // every value is stored under the same key, so the last normalized value wins
                        for (Double normVal : normVals) {
                            String formatVal = getOutputValue(normVal, true);
                            compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
                        }
                    } else if (config.isMeta()) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
                    } else {
                        // if is compact mode but such column is not final selected, should be empty, as only append
                        // target and finalSelect feature, no need append here so this code block is empty. TODO, do
                        // we need meta column?
                    }
                } else {
                    // it will cause variable fail to normalize
                    if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
                        // for multiple classification, binPosRate means rate of such category over all counts,
                        // reuse binPosRate for normalize
                        List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                        for (Double normVal : normVals) {
                            appendOutputValue(tuple, normVal, true);
                        }
                    } else {
                        tuple.append(config.isMeta() ? val : null);
                    }
                }
            }
        }
    } else {
        // for segment expansion variables
        int rawSize = input.size();
        for (int i = 0; i < this.columnConfigList.size(); i++) {
            ColumnConfig config = this.columnConfigList.get(i);
            // column configs beyond the raw input size wrap around onto the raw input fields
            int newIndex = i >= rawSize ? i % rawSize : i;
            String val = (input.get(newIndex) == null) ? "" : input.get(newIndex).toString();
            // for target column
            if (config.isTarget()) {
                if (modelConfig.isRegression()) {
                    int type = 0;
                    if (super.posTagSet.contains(rawTag)) {
                        type = 1;
                    } else if (super.negTagSet.contains(rawTag)) {
                        type = 0;
                    } else {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), type);
                    } else {
                        tuple.append(type);
                    }
                } else {
                    int index = -1;
                    for (int j = 0; j < tags.size(); j++) {
                        Set<String> tagSet = tags.get(j);
                        if (tagSet.contains(rawTag)) {
                            index = j;
                            break;
                        }
                    }
                    if (index == -1) {
                        log.error("Invalid data! The target value is not listed - " + rawTag);
                        warn("Invalid data! The target value is not listed - " + rawTag, WarnInNormalizeUDF.INVALID_TAG);
                        return null;
                    }
                    if (this.isCompactNorm) {
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), index);
                    } else {
                        tuple.append(index);
                    }
                }
                continue;
            }
            if (this.isCompactNorm) {
                // only output features and target, weight in compact norm mode
                if (!config.isMeta() && config.isFinalSelect()) {
                    // for multiple classification, binPosRate means rate of such category over all counts,
                    // reuse binPosRate for normalize
                    List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                    for (Double normVal : normVals) {
                        String formatVal = getOutputValue(normVal, true);
                        compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), formatVal);
                    }
                } else if (config.isMeta()) {
                    compactVarMap.put(CommonUtils.normColumnName(config.getColumnName()), val);
                } else {
                    // if is compact mode but such column is not final selected, should be empty, as only append
                    // target and finalSelect feature, no need append here so this code block is empty. TODO, do
                    // we need meta column?
                }
            } else {
                // for others
                if (CommonUtils.isToNormVariable(config, super.hasCandidates, modelConfig.isRegression())) {
                    List<Double> normVals = Normalizer.fullNormalize(config, val, cutoff, normType, this.categoryMissingNormType, this.categoricalIndexMap.get(config.getColumnNum()));
                    for (Double normVal : normVals) {
                        appendOutputValue(tuple, normVal, true);
                    }
                } else {
                    tuple.append(config.isMeta() ? val : null);
                }
            }
        }
    }
    // for compact norm mode, output to tuple at here
    if (this.isCompactNorm) {
        for (int i = 0; i < outputCompactColumns.size(); i++) {
            tuple.append(compactVarMap.get(outputCompactColumns.get(i)));
        }
    }
    // append tuple with weight.
    double weight = evaluateWeight(weightExpr, weightContext);
    tuple.append(weight);
    return tuple;
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class CommonUtils method getHeaders.
/**
 * Return header column array from the first line of a header file.
 * <p>
 * Each header is normalized with {@code normColumnName}. Unless namespace strict mode is enabled
 * ({@code Constants.SHIFU_NAMESPACE_STRICT_MODE}), the relative column name is used. A name that is already
 * present in the set of collected headers gets a {@code _<position>} suffix appended.
 *
 * @param pathHeader
 *            header path
 * @param delimiter
 *            the delimiter of headers
 * @param sourceType
 *            source type: hdfs or local
 * @param isFull
 *            if full header name including name space (currently not used by this method)
 * @return headers array
 * @throws IOException
 *             if any IO exception in reading file.
 *
 * @throws IllegalArgumentException
 *             if sourceType is null, if pathHeader is null or empty, if delimiter is null or empty.
 *
 * @throws ShifuException
 *             with {@code ERROR_HEADER_NOT_FOUND} if the header file cannot be read or its first line is null
 *             or empty.
 */
public static String[] getHeaders(String pathHeader, String delimiter, SourceType sourceType, boolean isFull) throws IOException {
    if (StringUtils.isEmpty(pathHeader) || StringUtils.isEmpty(delimiter) || sourceType == null) {
        // name the parameters that are actually validated here
        throw new IllegalArgumentException(String.format("Null or empty parameters pathHeader:%s, delimiter:%s, sourceType:%s", pathHeader, delimiter, sourceType));
    }
    BufferedReader reader = null;
    String pigHeaderStr = null;
    try {
        reader = ShifuFileUtils.getReader(pathHeader, sourceType);
        pigHeaderStr = reader.readLine();
        if (StringUtils.isEmpty(pigHeaderStr)) {
            // caught just below and reported as ERROR_HEADER_NOT_FOUND
            throw new RuntimeException(String.format("Cannot reade header info from the first line of file: %s", pathHeader));
        }
    } catch (Exception e) {
        log.error("Error in getReader, this must be catched in this method to make sure the next reader can be returned.", e);
        throw new ShifuException(ShifuErrorCode.ERROR_HEADER_NOT_FOUND);
    } finally {
        IOUtils.closeQuietly(reader);
    }
    List<String> headerList = new ArrayList<String>();
    Set<String> headerSet = new HashSet<String>();
    int index = 0;
    for (String str : Splitter.on(delimiter).split(pigHeaderStr)) {
        String columnName = StringUtils.trimToEmpty(str);
        if (!Environment.getBoolean(Constants.SHIFU_NAMESPACE_STRICT_MODE, false)) {
            columnName = getRelativePigHeaderColumnName(str);
        }
        /*
         * if(isFull) {
         * columnName = getFullPigHeaderColumnName(str);
         * } else {
         * columnName = getRelativePigHeaderColumnName(str);
         * }
         */
        // NOTE(review): the duplicate check uses the name before normColumnName, while the set stores
        // normalized names - confirm this is intended.
        if (headerSet.contains(columnName)) {
            columnName = columnName + "_" + index;
        }
        columnName = normColumnName(columnName);
        headerSet.add(columnName);
        index++;
        headerList.add(columnName);
    }
    return headerList.toArray(new String[0]);
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class HDFSUtils method getFS.
/*
 * Return the shared HDFS FileSystem, creating it on first use.
 *
 * The instance is created at most once under the HDFSUtils class lock, with checksum verification
 * switched off. A failure to create it is logged and reported as a ShifuException with
 * ERROR_GET_HDFS_SYSTEM.
 */
public static FileSystem getFS() {
    // fast path: already created, no locking needed
    if (hdfs != null) {
        return hdfs;
    }
    synchronized (HDFSUtils.class) {
        // re-check under the lock, another thread may have created it in the meantime
        if (hdfs == null) {
            try {
                // Configure a temporary instance completely before publishing it through the
                // shared field, so that nobody can pick up a half-initialized FileSystem.
                FileSystem fullyInitialized = FileSystem.get(conf);
                fullyInitialized.setVerifyChecksum(false);
                hdfs = fullyInitialized;
            } catch (IOException e) {
                LOG.error("Error on creating hdfs FileSystem object.", e);
                throw new ShifuException(ShifuErrorCode.ERROR_GET_HDFS_SYSTEM);
            }
        }
        return hdfs;
    }
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class StatsStep method process.
/*
 * Run the stats step: refresh the column flags, persist ModelConfig/ColumnConfig locally (and copy them
 * to HDFS when the data set source is HDFS), pick the stats executor matching the run mode and binning
 * algorithm, and execute it.
 *
 * Any exception raised while doing so is logged and not propagated; the column config list is returned
 * in either case.
 *
 * (non-Javadoc)
 *
 * @see ml.shifu.common.Step#process()
 */
@Override
public List<ColumnConfig> process() throws IOException {
    LOG.info("Step Start: stats");
    final long start = System.currentTimeMillis();
    try {
        // User may change variable type after `shifu init`
        ColumnConfigUpdater.updateColumnConfigFlags(this.modelConfig, this.columnConfigList, ModelStep.STATS);

        LOG.info("Saving ModelConfig, ColumnConfig and then upload to HDFS ...");
        final File localModelConfig = new File(pathFinder.getModelConfigPath(SourceType.LOCAL));
        JSONUtils.writeValue(localModelConfig, modelConfig);
        final File localColumnConfig = new File(pathFinder.getColumnConfigPath(SourceType.LOCAL));
        JSONUtils.writeValue(localColumnConfig, columnConfigList);
        if (SourceType.HDFS.equals(modelConfig.getDataSet().getSource())) {
            CommonUtils.copyConfFromLocalToHDFS(modelConfig, this.pathFinder);
        }

        final AbstractStatsExecutor executor;
        if (modelConfig.isMapReduceRunMode()) {
            final ModelStatsConf.BinningAlgorithm algorithm = modelConfig.getBinningAlgorithm();
            if (algorithm.equals(ModelStatsConf.BinningAlgorithm.DynamicBinning)) {
                executor = new DIBStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            } else if (algorithm.equals(ModelStatsConf.BinningAlgorithm.MunroPat)) {
                executor = new MunroPatStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            } else if (algorithm.equals(ModelStatsConf.BinningAlgorithm.MunroPatI)) {
                executor = new MunroPatIStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            } else if (algorithm.equals(ModelStatsConf.BinningAlgorithm.SPDT)) {
                executor = new SPDTStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            } else {
                // SPDTI, and every other algorithm, is handled by the SPDTI executor
                executor = new SPDTIStatsExecutor(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
            }
        } else if (modelConfig.isLocalRunMode()) {
            executor = new AkkaStatsWorker(new BasicModelProcessor(super.modelConfig, super.columnConfigList, super.otherConfigs), modelConfig, columnConfigList);
        } else {
            throw new ShifuException(ShifuErrorCode.ERROR_UNSUPPORT_MODE);
        }

        executor.doStats();

        // push the configs to HDFS once more after the stats run
        if (SourceType.HDFS.equals(modelConfig.getDataSet().getSource())) {
            CommonUtils.copyConfFromLocalToHDFS(modelConfig, this.pathFinder);
        }
    } catch (Exception e) {
        LOG.error("Error:", e);
    }
    LOG.info("Step Finished: stats with {} ms", (System.currentTimeMillis() - start));
    return columnConfigList;
}
use of ml.shifu.shifu.exception.ShifuException in project shifu by ShifuML.
the class DataPrepareWorker method convertRawDataIntoValueObject.
/*
 * Convert raw data into @ValueObject for calculating stats.
 *
 * Each line is split with the data set delimiter and sampled according to the binning sample rate
 * (only records with a negative tag are sampled when isBinningSampleNegOnly is set). For every column
 * that has an actor registered in columnNumToActorMap, a ValueObject carrying the value, the weight and
 * the tag is appended to that column's list. Values counted as missing or invalid are tallied per column.
 *
 * @param rawDataList
 * - raw data for training
 * @param columnVoListMap
 * <column-id --> @ValueObject list>, filled by this method
 * @return the total number of lines seen (including lines dropped by sampling) together with the
 * per-column missing counts
 * @throws ShifuException
 * if the data field length is not equal header length
 */
private DataPrepareStatsResult convertRawDataIntoValueObject(List<String> rawDataList, Map<Integer, List<ValueObject>> columnVoListMap) throws ShifuException {
    double sampleRate = modelConfig.getBinningSampleRate();
    long total = 0L;
    Map<Integer, Long> missingMap = new HashMap<Integer, Long>();
    for (String line : rawDataList) {
        total++;
        String[] raw = CommonUtils.split(line, modelConfig.getDataSetDelimiter());
        if (raw.length != columnConfigList.size()) {
            log.error("Expected Columns: " + columnConfigList.size() + ", but got: " + raw.length);
            throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
        }
        String tag = CommonUtils.trimTag(raw[targetColumnNum]);
        if (modelConfig.isBinningSampleNegOnly()) {
            // only down-sample negative records, always keep the others
            if (modelConfig.getNegTags().contains(tag) && random.nextDouble() > sampleRate) {
                continue;
            }
        } else {
            if (random.nextDouble() > sampleRate) {
                continue;
            }
        }
        for (int i = 0; i < raw.length; i++) {
            if (!columnNumToActorMap.containsKey(i)) {
                // ignore non-used columns
                continue;
            }
            ValueObject vo = new ValueObject();
            if (i >= columnConfigList.size()) {
                log.error("The input size is longer than expected, need to check your data");
                continue;
            }
            ColumnConfig config = columnConfigList.get(i);
            if (config.isNumerical()) {
                // NUMERICAL: a value that cannot be parsed is counted as missing and dropped
                try {
                    vo.setValue(Double.valueOf(raw[i].trim()));
                    vo.setRaw(null);
                } catch (Exception e) {
                    log.debug("Column " + config.getColumnNum() + ": " + config.getColumnName() + " is expected to be NUMERICAL, however received: " + raw[i]);
                    incMap(i, missingMap);
                    continue;
                }
            } else if (config.isCategorical()) {
                // CATEGORICAL: a missing/invalid value is counted, but the record is still kept
                if (raw[i] == null || StringUtils.isEmpty(raw[i]) || modelConfig.getDataSet().getMissingOrInvalidValues().contains(raw[i].toLowerCase().trim())) {
                    incMap(i, missingMap);
                }
                vo.setRaw(raw[i].trim());
                vo.setValue(null);
            } else {
                // AUTO TYPE: numerical if it parses, otherwise kept as a raw value and counted as missing
                try {
                    vo.setValue(Double.valueOf(raw[i]));
                    vo.setRaw(null);
                } catch (Exception e) {
                    incMap(i, missingMap);
                    vo.setRaw(raw[i]);
                    vo.setValue(null);
                }
            }
            if (this.weightedColumnNum != -1) {
                // use the weight column when it is configured, fall back to 1.0 only if it is not a number;
                // the parsed weight must not be overwritten afterwards
                try {
                    vo.setWeight(Double.valueOf(raw[weightedColumnNum]));
                } catch (NumberFormatException e) {
                    vo.setWeight(1.0);
                }
            }
            vo.setTag(tag);
            List<ValueObject> voList = columnVoListMap.get(i);
            if (voList == null) {
                voList = new ArrayList<ValueObject>();
                columnVoListMap.put(i, voList);
            }
            voList.add(vo);
        }
    }
    DataPrepareStatsResult rt = new DataPrepareStatsResult(total, missingMap);
    return rt;
}
Aggregations