Search in sources :

Example 6 with BasicML

use of org.encog.ml.BasicML in project shifu by ShifuML.

the class NNOutput method writeBinaryModelWeightsToFileSystem.

/**
 * Installs {@code weights} into this instance's network and saves that network through
 * {@link BinaryNNSerializer} to {@code out}.
 *
 * <p>An {@link IOException} raised while obtaining the file system or saving is logged and not
 * rethrown, so the caller is not notified when the write fails.
 *
 * @param weights
 *            weights to set on the network's flat representation before saving
 * @param out
 *            destination path of the binary model
 */
private void writeBinaryModelWeightsToFileSystem(double[] weights, Path out) {
    LOG.info("Writing NN models to {}.", out);
    // The network is mutated in place; the saved model reflects the weights passed in.
    this.network.getFlat().setWeights(weights);
    try {
        FileSystem targetFs = FileSystem.get(new Configuration());
        // The serializer takes a list of models; wrap the single network.
        BinaryNNSerializer.save(modelConfig, columnConfigList, Arrays.asList((BasicML) this.network), targetFs, out);
    } catch (IOException e) {
        LOG.error("Error in writing model", e);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) BasicML(org.encog.ml.BasicML) IOException(java.io.IOException)

Example 7 with BasicML

use of org.encog.ml.BasicML in project shifu by ShifuML.

the class EvalScoreUDF method exec.

/**
 * Scores one input record and returns a tuple holding tag, weight, model scores and meta columns.
 *
 * <p>The model runner is built lazily on the first call (see the comment inside for why). The
 * output tuple is filled positionally in this order: tag, weight, main-model scores (plus hidden
 * layer output or predicted tag depending on the branch), sub-model scores, then one value per
 * meta column.
 *
 * @param input
 *            the raw record to score
 * @return the populated tuple, or {@code null} when the record is skipped: in CSV mode when the
 *         first column equals the first header name, when the record converts to an empty
 *         column map, or when the model runner yields no result
 * @throws IOException
 *             declared by the signature; which calls raise it is not visible in this method
 */
@SuppressWarnings("deprecation")
public Tuple exec(Tuple input) throws IOException {
    if (isCsvFormat) {
        // Skip a record whose first column is the (normalized) first header name - presumably the header row.
        String firstCol = ((input.get(0) == null) ? "" : input.get(0).toString());
        if (this.headers[0].equals(CommonUtils.normColumnName(firstCol))) {
            // TODO what to do if the column value == column name? ...
            return null;
        }
    }
    // Wall-clock start, only used by the sampled timing log at the end.
    long start = System.currentTimeMillis();
    if (this.modelRunner == null) {
        // here to initialize modelRunner, this is moved from constructor to here to avoid OOM in client side.
        // UDF in pig client will be initialized to get some metadata issues
        List<BasicML> models = ModelSpecLoaderUtils.loadBasicModels(modelConfig, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
        this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), models, this.outputHiddenLayerIndex, this.isMultiThreadScoring);
        // Register every sub model on the runner and remember how many models each one holds.
        List<ModelSpec> subModels = ModelSpecLoaderUtils.loadSubModels(modelConfig, this.columnConfigList, evalConfig, evalConfig.getDataSet().getSource(), evalConfig.getGbtConvertToProb(), evalConfig.getGbtScoreConvertStrategy());
        if (CollectionUtils.isNotEmpty(subModels)) {
            for (ModelSpec modelSpec : subModels) {
                this.modelRunner.addSubModels(modelSpec, this.isMultiThreadScoring);
                this.subModelsCnt.put(modelSpec.getModelName(), modelSpec.getModels().size());
            }
        }
        this.modelCnt = models.size();
        // reset the model count in the classification case
        if (modelConfig.isClassification()) {
            if (modelConfig.getTrain().isOneVsAll()) {
                if (modelConfig.getTags().size() == 2) {
                    // onevsall, modelcnt is 1
                    this.modelCnt = 1;
                } else {
                    this.modelCnt = modelConfig.getTags().size();
                }
            } else {
                if (modelConfig.getTags().size() == 2) {
                    // native binary
                    this.modelCnt = 1;
                } else {
                    // native multiple classification model cnt is bagging num
                    // i.e. min(number of loaded models, bagging num)
                    this.modelCnt = (this.modelCnt >= modelConfig.getBaggingNum() ? modelConfig.getBaggingNum() : this.modelCnt);
                }
            }
            // keep only the first modelCnt models and rebuild the runner on that subset
            // NOTE(review): subList throws IndexOutOfBoundsException when modelCnt exceeds models.size()
            // (e.g. no model loaded but modelCnt forced to 1, or fewer models than tags) - confirm this cannot happen.
            models = models.subList(0, this.modelCnt);
            // NOTE(review): this replaces the runner that received addSubModels(...) above and the sub models
            // are not re-added here; looks like they are dropped for classification - confirm against ModelRunner.
            this.modelRunner = new ModelRunner(modelConfig, columnConfigList, this.headers, evalConfig.getDataSet().getDataDelimiter(), models, this.outputHiddenLayerIndex, this.isMultiThreadScoring);
        }
        // Integer.parseInt throws NumberFormatException when scale is not a valid integer string.
        this.modelRunner.setScoreScale(Integer.parseInt(this.scale));
        log.info("DEBUG: model cnt " + this.modelCnt + " sub models cnt " + modelRunner.getSubModelsCnt());
    }
    Map<NSColumn, String> rawDataNsMap = CommonUtils.convertDataIntoNsMap(input, this.headers, this.segFilterSize);
    if (MapUtils.isEmpty(rawDataNsMap)) {
        return null;
    }
    String tag = CommonUtils.trimTag(rawDataNsMap.get(new NSColumn(modelConfig.getTargetColumnName(evalConfig))));
    // filter invalid tag record out
    // disable the tag check, since there is no bad tag in eval data set
    // and user just want to score the data, but don't run performance evaluation
    /*
         * if(!tagSet.contains(tag)) {
         * if(System.currentTimeMillis() % 100 == 0) {
         * log.warn("Invalid tag: " + tag);
         * }
         * if(isPigEnabled(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")) {
         * PigStatusReporter.getInstance().getCounter(Constants.SHIFU_GROUP_COUNTER, Constants.COUNTER_RECORDS)
         * .increment(1);
         * }
         * return null;
         * }
         */
    // Time the scoring call alone; nanoTime difference divided by 1000 gives microseconds.
    long startTime = System.nanoTime();
    CaseScoreResult cs = modelRunner.computeNsData(rawDataNsMap);
    long runInterval = (System.nanoTime() - startTime) / 1000L;
    if (cs == null) {
        // Log only when the current millisecond is a multiple of 100, i.e. a time-based sample of failures.
        if (System.currentTimeMillis() % 100 == 0) {
            log.warn("Get null result, for input: " + input.toDelimitedString("|"));
        }
        return null;
    }
    // Tuple layout from here on is positional: tag, weight, scores..., meta columns...
    Tuple tuple = TupleFactory.getInstance().newTuple();
    tuple.append(tag);
    // Weight comes from the configured weight column, or defaults to "1.0" when none is configured.
    String weight = null;
    if (StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName())) {
        weight = rawDataNsMap.get(new NSColumn(evalConfig.getDataSet().getWeightColumnName()));
    } else {
        weight = "1.0";
    }
    incrementTagCounters(tag, weight, runInterval);
    Map<String, CaseScoreResult> subModelScores = cs.getSubModelScores();
    tuple.append(weight);
    if (this.isLinearTarget || modelConfig.isRegression()) {
        // Linear target / regression: main model scores, optional hidden layer output, then sub model scores.
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendModelScore(tuple, cs, true);
            if (this.outputHiddenLayerIndex != 0) {
                appendFirstHiddenOutputScore(tuple, cs.getHiddenLayerScores(), true);
            }
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            // Appended in the map's iteration order; only the values are used.
            Iterator<Map.Entry<String, CaseScoreResult>> iterator = subModelScores.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry<String, CaseScoreResult> entry = iterator.next();
                CaseScoreResult subCs = entry.getValue();
                appendModelScore(tuple, subCs, false);
            }
        }
    } else {
        // Otherwise: simple scores followed by the tag predicted by mcPredictor, then sub model scores.
        if (CollectionUtils.isNotEmpty(cs.getScores())) {
            appendSimpleScore(tuple, cs);
            tuple.append(this.mcPredictor.predictTag(cs).getTag());
        }
        if (MapUtils.isNotEmpty(subModelScores)) {
            // Appended in the map's iteration order; only the values are used.
            Iterator<Map.Entry<String, CaseScoreResult>> iterator = subModelScores.entrySet().iterator();
            while (iterator.hasNext()) {
                Map.Entry<String, CaseScoreResult> entry = iterator.next();
                CaseScoreResult subCs = entry.getValue();
                appendSimpleScore(tuple, subCs);
            }
        }
    }
    // append meta data
    List<String> metaColumns = evalConfig.getAllMetaColumns(modelConfig);
    if (CollectionUtils.isNotEmpty(metaColumns)) {
        for (String meta : metaColumns) {
            tuple.append(rawDataNsMap.get(new NSColumn(meta)));
        }
    }
    // Time-based sampling again: the per-record running time is logged only on exact-second milliseconds.
    if (System.currentTimeMillis() % 1000 == 0L) {
        log.info("running time is " + (System.currentTimeMillis() - start) + " ms.");
    }
    return tuple;
}
Also used : BasicML(org.encog.ml.BasicML) CaseScoreResult(ml.shifu.shifu.container.CaseScoreResult) Entry(java.util.Map.Entry) ModelSpec(ml.shifu.shifu.core.model.ModelSpec) Map(java.util.Map) SortedMap(java.util.SortedMap) Tuple(org.apache.pig.data.Tuple) ModelRunner(ml.shifu.shifu.core.ModelRunner) NSColumn(ml.shifu.shifu.column.NSColumn)

Example 8 with BasicML

use of org.encog.ml.BasicML in project shifu by ShifuML.

the class ModelSpecLoaderUtils method loadBasicModels.

/**
 * Locates the basic model files for the given configuration and loads each of them.
 *
 * @param modelConfig
 *            ModelConfig
 * @param evalConfig
 *            eval configuration
 * @param sourceType
 *            source type, used both to resolve the file system and to locate the model files
 * @return the loaded models, in the order their files were located; an empty list when no
 *         model file was located
 * @throws IOException
 *             Exception when fail to locate or load models
 */
public static List<BasicML> loadBasicModels(ModelConfig modelConfig, EvalConfig evalConfig, SourceType sourceType) throws IOException {
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    List<BasicML> loaded = new ArrayList<BasicML>();
    List<FileStatus> located = locateBasicModels(modelConfig, evalConfig, sourceType);
    // Nothing located: hand back the empty (still mutable) list.
    if (CollectionUtils.isEmpty(located)) {
        return loaded;
    }
    for (FileStatus status : located) {
        BasicML model = loadModel(modelConfig, status.getPath(), fs);
        loaded.add(model);
    }
    return loaded;
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) BasicML(org.encog.ml.BasicML)

Example 9 with BasicML

use of org.encog.ml.BasicML in project shifu by ShifuML.

the class ModelSpecLoaderUtils method loadBasicModels.

/**
 * Load models of the given algorithm from a local directory.
 *
 * <p>Files are selected by extension: the lower-cased algorithm name (for example {@code .nn} for
 * {@code ALGORITHM.NN}). They are loaded in ascending file-name order.
 *
 * @param modelsPath
 *            - local directory that contains the model files. The files are listed with
 *            {@link File#listFiles(FilenameFilter)}, which yields {@code null} for a path that
 *            is not a directory, so a plain file path ends in {@link IOException}
 * @param alg
 *            the algorithm; {@code ALGORITHM.DT} is rejected
 * @param isConvertToProb
 *            if convert to prob for gbt model
 * @param gbtScoreConvertStrategy
 *            specify how to convert gbt raw score
 * @return - a list of @BasicML, one per matching file for NN, LR, GBT and RF; matching files of
 *         any other algorithm are skipped
 * @throws IOException
 *             - throw exception when the directory cannot be listed or when loading model files
 * @throws IllegalArgumentException
 *             when {@code modelsPath} or {@code alg} is null, or {@code alg} is DT
 */
public static List<BasicML> loadBasicModels(final String modelsPath, final ALGORITHM alg, boolean isConvertToProb, String gbtScoreConvertStrategy) throws IOException {
    // One message per rejected argument, so the error names the real cause.
    if (modelsPath == null) {
        throw new IllegalArgumentException("The model path shouldn't be null");
    }
    if (alg == null) {
        throw new IllegalArgumentException("The algorithm shouldn't be null");
    }
    if (ALGORITHM.DT.equals(alg)) {
        throw new IllegalArgumentException("Algorithm " + alg + " is not supported when loading basic models");
    }
    // we have to register PersistBasicFloatNetwork for loading such models
    if (ALGORITHM.NN.equals(alg)) {
        PersistorRegistry.getInstance().add(new PersistBasicFloatNetwork());
    }
    // Computed once, and with Locale.ROOT so the extension does not depend on the JVM default locale.
    // Fully qualified because this block cannot add an import to the file.
    final String modelFileSuffix = "." + alg.name().toLowerCase(java.util.Locale.ROOT);
    File modelsPathDir = new File(modelsPath);
    File[] modelFiles = modelsPathDir.listFiles(new FilenameFilter() {

        @Override
        public boolean accept(File dir, String name) {
            return name.endsWith(modelFileSuffix);
        }
    });
    // listFiles returns null when the path is not a directory or an I/O error occurs.
    if (modelFiles == null) {
        throw new IOException(String.format("Failed to list files in %s", modelsPathDir.getAbsolutePath()));
    }
    // sort file names so that models are always loaded in the same order
    Arrays.sort(modelFiles, new Comparator<File>() {

        @Override
        public int compare(File from, File to) {
            return from.getName().compareTo(to.getName());
        }
    });
    List<BasicML> models = new ArrayList<BasicML>(modelFiles.length);
    for (File nnf : modelFiles) {
        InputStream is = null;
        try {
            is = new FileInputStream(nnf);
            if (ALGORITHM.NN.equals(alg)) {
                // NN files are read by one of two loaders depending on whether the stream is gzip.
                GzipStreamPair pair = GzipStreamPair.isGZipFormat(is);
                if (pair.isGzip()) {
                    models.add(BasicML.class.cast(NNModel.loadFromStream(pair.getInput())));
                } else {
                    models.add(BasicML.class.cast(EncogDirectoryPersistence.loadObject(pair.getInput())));
                }
            } else if (ALGORITHM.LR.equals(alg)) {
                models.add(LR.loadFromStream(is));
            } else if (ALGORITHM.GBT.equals(alg) || ALGORITHM.RF.equals(alg)) {
                models.add(TreeModel.loadFromStream(is, isConvertToProb, gbtScoreConvertStrategy));
            }
            // Any other algorithm: the file is opened and closed but no model is added.
        } finally {
            IOUtils.closeQuietly(is);
        }
    }
    return models;
}
Also used : FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) BasicML(org.encog.ml.BasicML) PersistBasicFloatNetwork(ml.shifu.shifu.core.dtrain.dataset.PersistBasicFloatNetwork)

Example 10 with BasicML

use of org.encog.ml.BasicML in project shifu by ShifuML.

the class CommonUtils method computeTreeModelFeatureImportance.

/**
 * Collects the feature importances of every {@link TreeModel} in {@code models} and merges them
 * into a single map.
 *
 * @param models
 *            the models to inspect; entries that are not {@link TreeModel} instances are ignored
 * @return merged feature importance keyed by column id
 * @throws IllegalStateException
 *             if {@code models} contains no {@link TreeModel}
 */
public static Map<Integer, MutablePair<String, Double>> computeTreeModelFeatureImportance(List<BasicML> models) {
    List<Map<Integer, MutablePair<String, Double>>> perModelImportances = new ArrayList<Map<Integer, MutablePair<String, Double>>>();
    for (BasicML candidate : models) {
        // Only tree models expose feature importances; skip everything else.
        if (!(candidate instanceof TreeModel)) {
            continue;
        }
        perModelImportances.add(((TreeModel) candidate).getFeatureImportances());
    }
    if (perModelImportances.isEmpty()) {
        throw new IllegalStateException("Feature importance calculation abort due to no tree model found!!");
    }
    return mergeImportanceList(perModelImportances);
}
Also used : MutablePair(org.apache.commons.lang3.tuple.MutablePair) TreeModel(ml.shifu.shifu.core.TreeModel) BasicML(org.encog.ml.BasicML)

Aggregations

BasicML (org.encog.ml.BasicML): 23; File (java.io.File): 6; BasicNetwork (org.encog.neural.networks.BasicNetwork): 5; IOException (java.io.IOException): 4; ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig): 4; BasicFloatNetwork (ml.shifu.shifu.core.dtrain.dataset.BasicFloatNetwork): 4; PersistBasicFloatNetwork (ml.shifu.shifu.core.dtrain.dataset.PersistBasicFloatNetwork): 4; FileSystem (org.apache.hadoop.fs.FileSystem): 4; FlatNetwork (org.encog.neural.flat.FlatNetwork): 4; ArrayList (java.util.ArrayList): 3; NSColumn (ml.shifu.shifu.column.NSColumn): 3; ModelRunner (ml.shifu.shifu.core.ModelRunner): 3; ModelSpec (ml.shifu.shifu.core.model.ModelSpec): 3; MutablePair (org.apache.commons.lang3.tuple.MutablePair): 3; Configuration (org.apache.hadoop.conf.Configuration): 3; FileStatus (org.apache.hadoop.fs.FileStatus): 3; Path (org.apache.hadoop.fs.Path): 3; JarFile (java.util.jar.JarFile): 2; CaseScoreResult (ml.shifu.shifu.container.CaseScoreResult): 2; SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType): 2