Search in sources :

Example 6 with FeatureList

use of edu.neu.ccs.pyramid.feature.FeatureList in project pyramid by cheng-li.

the class MekaFormat method loadMLClfDatasetPreDense.

/**
 * Reads the {@code @data} section of the given file into a newly built
 * {@link MultiLabelClfDataSet}.
 *
 * <p>Every line after the first line starting with {@code @data} is treated as one
 * data point. A surrounding {@code {...}} pair is stripped, the line is split on
 * commas, and the first token is ignored. The remaining tokens are addressed by
 * position, starting at 0: a position present in {@code labelMap} adds a label when
 * its value equals 1.0, a position present in {@code featureMap} sets a feature
 * value, and any other position is an error.
 *
 * @param file        file to read
 * @param numClasses  number of classes passed to the data set builder
 * @param numFeatures number of features; also the offset subtracted from a label's
 *                    column position to obtain its label index
 * @param numData     number of data points passed to the data set builder
 * @param labelMap    column position (as a decimal string) to label name
 * @param featureMap  column position (as a decimal string) to feature name
 * @return the populated data set
 * @throws IOException      if the file cannot be opened or read
 * @throws RuntimeException if a column position is in neither map
 */
private static MultiLabelClfDataSet loadMLClfDatasetPreDense(File file, int numClasses, int numFeatures, int numData, Map<String, String> labelMap, Map<String, String> featureMap) throws IOException {
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numDataPoints(numData).numClasses(numClasses).numFeatures(numFeatures).build();
    // set features
    List<Feature> featureList = new LinkedList<>();
    for (int m = 0; m < numFeatures; m++) {
        Feature feature = new Feature();
        feature.setIndex(m);
        feature.setName(featureMap.get(Integer.toString(m)));
        featureList.add(feature);
    }
    dataSet.setFeatureList(new FeatureList(featureList));
    // set labels: keys are column positions, label indices start after the features
    Map<Integer, String> labelIndexMap = new HashMap<>();
    for (Map.Entry<String, String> entry : labelMap.entrySet()) {
        labelIndexMap.put(Integer.parseInt(entry.getKey()) - numFeatures, entry.getValue());
    }
    dataSet.setLabelTranslator(new LabelTranslator(labelIndexMap));
    // create feature matrix; try-with-resources closes the reader even when a
    // malformed line makes parsing throw
    // NOTE(review): FileReader decodes with the platform default charset - confirm
    // that is acceptable for these files.
    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
        String line;
        int dataCount = 0;
        boolean inDataSection = false;
        while ((line = br.readLine()) != null) {
            if (line.startsWith("@data")) {
                inDataSection = true;
                continue;
            }
            if (!inDataSection) {
                continue;
            }
            if (line.startsWith("{") && line.endsWith("}")) {
                line = line.substring(1, line.length() - 1);
            }
            String[] tokens = line.split(",");
            // Starts at -2 so that the first token maps to -1 (skipped) and the
            // second token maps to column 0.
            // NOTE(review): presumably the first token is a non-attribute prefix - confirm.
            int column = -2;
            for (String token : tokens) {
                column++;
                if (column == -1) {
                    continue;
                }
                String columnKey = Integer.toString(column);
                if (labelMap.containsKey(columnKey)) {
                    if (Double.parseDouble(token) == 1.0) {
                        dataSet.addLabel(dataCount, column - numFeatures);
                    }
                } else if (featureMap.containsKey(columnKey)) {
                    dataSet.setFeatureValue(dataCount, column, Double.parseDouble(token));
                } else {
                    throw new RuntimeException("Index not found in the line: " + line);
                }
            }
            dataCount++;
        }
    }
    return dataSet;
}
Also used : Feature(edu.neu.ccs.pyramid.feature.Feature) FeatureList(edu.neu.ccs.pyramid.feature.FeatureList)

Example 7 with FeatureList

use of edu.neu.ccs.pyramid.feature.FeatureList in project pyramid by cheng-li.

the class DataSetUtil method concatenateByColumn.

/**
 * Builds a new data set whose columns are the columns of {@code dataSet1} followed
 * by the columns of {@code dataSet2}. The two inputs are assumed to have different
 * feature sets.
 *
 * <p>The number of data points, number of classes, density, missing-value flag,
 * labels, label translator and id translator are all taken from {@code dataSet1}.
 * NOTE(review): nothing here checks that {@code dataSet2} has the same number of
 * data points - confirm callers guarantee it.
 *
 * @param dataSet1 supplies the leading columns and all per-row metadata
 * @param dataSet2 supplies the trailing columns
 * @return the combined data set
 */
public static ClfDataSet concatenateByColumn(ClfDataSet dataSet1, ClfDataSet dataSet2) {
    int rowCount = dataSet1.getNumDataPoints();
    int leftWidth = dataSet1.getNumFeatures();
    int rightWidth = dataSet2.getNumFeatures();
    ClfDataSet merged = ClfDataSetBuilder.getBuilder()
            .numDataPoints(rowCount)
            .numFeatures(leftWidth + rightWidth)
            .numClasses(dataSet1.getNumClasses())
            .dense(dataSet1.isDense())
            .missingValue(dataSet1.hasMissingValue())
            .build();

    // columns of the first data set keep their positions
    for (int col = 0; col < leftWidth; col++) {
        for (Vector.Element cell : dataSet1.getColumn(col).nonZeroes()) {
            merged.setFeatureValue(cell.index(), col, cell.get());
        }
    }
    // columns of the second data set are shifted past the first data set's columns
    for (int col = 0; col < rightWidth; col++) {
        int targetCol = leftWidth + col;
        for (Vector.Element cell : dataSet2.getColumn(col).nonZeroes()) {
            merged.setFeatureValue(cell.index(), targetCol, cell.get());
        }
    }

    int[] sourceLabels = dataSet1.getLabels();
    for (int row = 0; row < rowCount; row++) {
        merged.setLabel(row, sourceLabels[row]);
    }

    // feature descriptors follow the same order as the columns
    FeatureList combined = new FeatureList();
    dataSet1.getFeatureList().getAll().forEach(combined::add);
    dataSet2.getFeatureList().getAll().forEach(combined::add);
    merged.setFeatureList(combined);

    merged.setLabelTranslator(dataSet1.getLabelTranslator());
    merged.setIdTranslator(dataSet1.getIdTranslator());
    return merged;
}
Also used : FeatureList(edu.neu.ccs.pyramid.feature.FeatureList) Vector(org.apache.mahout.math.Vector) Feature(edu.neu.ccs.pyramid.feature.Feature)

Example 8 with FeatureList

use of edu.neu.ccs.pyramid.feature.FeatureList in project pyramid by cheng-li.

the class DataSetUtil method sampleFeatures.

/**
 * Builds a regression data set that contains only the selected columns of
 * {@code dataSet}, in the order given by {@code columnsToKeep}.
 *
 * <p>Labels and the id translator are carried over; density and the missing-value
 * flag match the source. The kept features are re-indexed 0..k-1.
 * NOTE(review): {@code setIndex} is called on the Feature objects obtained from the
 * source data set's feature list, so the source appears to see the new indices
 * too - confirm this sharing is intended.
 *
 * @param dataSet       source data set
 * @param columnsToKeep indices of the source columns to retain
 * @return the trimmed data set
 */
public static RegDataSet sampleFeatures(RegDataSet dataSet, List<Integer> columnsToKeep) {
    RegDataSet trimmed = RegDataSetBuilder.getBuilder()
            .numDataPoints(dataSet.getNumDataPoints())
            .numFeatures(columnsToKeep.size())
            .missingValue(dataSet.hasMissingValue())
            .dense(dataSet.isDense())
            .build();

    // copy the non-zero entries of each retained column into its new position
    for (int newCol = 0; newCol < trimmed.getNumFeatures(); newCol++) {
        int sourceCol = columnsToKeep.get(newCol);
        Vector column = dataSet.getColumn(sourceCol);
        for (Vector.Element cell : column.nonZeroes()) {
            trimmed.setFeatureValue(cell.index(), newCol, cell.get());
        }
    }

    // copy labels
    double[] targets = dataSet.getLabels();
    for (int row = 0; row < trimmed.getNumDataPoints(); row++) {
        trimmed.setLabel(row, targets[row]);
    }
    trimmed.setIdTranslator(dataSet.getIdTranslator());

    // pick the matching feature descriptors and renumber them
    List<Feature> sourceFeatures = dataSet.getFeatureList().getAll();
    List<Feature> keptFeatures = columnsToKeep.stream()
            .map(col -> sourceFeatures.get(col))
            .collect(Collectors.toList());
    int position = 0;
    for (Feature feature : keptFeatures) {
        feature.setIndex(position++);
    }
    trimmed.setFeatureList(new FeatureList(keptFeatures));
    return trimmed;
}
Also used : FeatureList(edu.neu.ccs.pyramid.feature.FeatureList) Vector(org.apache.mahout.math.Vector) Feature(edu.neu.ccs.pyramid.feature.Feature)

Example 9 with FeatureList

use of edu.neu.ccs.pyramid.feature.FeatureList in project pyramid by cheng-li.

the class RegTreeInspector method featureImportance.

/**
     * pair contains feature name and reduction
     * @param tree
     * @return
     */
//    public static Map<Integer, Pair<String,Double>> featureImportance(RegressionTree tree){
//        List<Feature> featureList = tree.getFeatureList().getAll();
//        Map<Integer, Pair<String,Double>> map = new HashMap<>();
//        List<Node> nodes = tree.traverse();
//        nodes.stream().filter(node -> !node.isLeaf())
//                .forEach(node -> {
//                    int featureIndex = node.getFeatureIndex();
//                    String featureName = featureList.get(node.getFeatureIndex()).getName();
//                    double reduction = node.getReduction();
//                    Pair<String,Double> oldPair = map.getOrDefault(featureIndex, new Pair<>(featureName,0.0));
//                    Pair<String, Double> newPair = new Pair<>(featureName,oldPair.getSecond()+reduction);
//                    map.put(featureIndex, newPair);
//                });
//        return map;
//    }
/**
 * Sums, for each feature used in a split, the reduction reported by every non-leaf
 * node of the tree that splits on it. Features that no non-leaf node refers to do
 * not appear in the result.
 *
 * @param tree the regression tree to inspect
 * @return map from feature to its accumulated reduction
 */
public static Map<Feature, Double> featureImportance(RegressionTree tree) {
    FeatureList treeFeatures = tree.getFeatureList();
    Map<Feature, Double> importance = new HashMap<>();
    for (Node node : tree.traverse()) {
        if (node.isLeaf()) {
            continue;
        }
        Feature splitFeature = treeFeatures.get(node.getFeatureIndex());
        double reduction = node.getReduction();
        double accumulated = importance.getOrDefault(splitFeature, 0.0);
        importance.put(splitFeature, reduction + accumulated);
    }
    return importance;
}
Also used : java.util(java.util) Feature(edu.neu.ccs.pyramid.feature.Feature) Vector(org.apache.mahout.math.Vector) FeatureList(edu.neu.ccs.pyramid.feature.FeatureList) Collectors(java.util.stream.Collectors) Pair(edu.neu.ccs.pyramid.util.Pair) FeatureList(edu.neu.ccs.pyramid.feature.FeatureList) Feature(edu.neu.ccs.pyramid.feature.Feature)

Example 10 with FeatureList

use of edu.neu.ccs.pyramid.feature.FeatureList in project pyramid by cheng-li.

the class DataSetUtil method sampleFeatures.

/**
 * Builds a classification data set that contains only the selected columns of
 * {@code dataSet}, in the order given by {@code columnsToKeep}.
 *
 * <p>The result is dense when the source is dense and sparse otherwise. Labels,
 * the label translator and the id translator are carried over, and the kept
 * features are re-indexed 0..k-1.
 * NOTE(review): {@code setIndex} is called on the Feature objects obtained from the
 * source data set's feature list, so the source appears to see the new indices
 * too - confirm this sharing is intended.
 *
 * @param dataSet       source data set
 * @param columnsToKeep indices of the source columns to retain
 * @return the trimmed data set
 */
public static ClfDataSet sampleFeatures(ClfDataSet dataSet, List<Integer> columnsToKeep) {
    int classCount = dataSet.getNumClasses();
    boolean hasMissing = dataSet.hasMissingValue();
    // keep density
    ClfDataSet trimmed = dataSet.isDense()
            ? new DenseClfDataSet(dataSet.getNumDataPoints(), columnsToKeep.size(), hasMissing, classCount)
            : new SparseClfDataSet(dataSet.getNumDataPoints(), columnsToKeep.size(), hasMissing, classCount);

    // copy the non-zero entries of each retained column into its new position
    for (int newCol = 0; newCol < trimmed.getNumFeatures(); newCol++) {
        int sourceCol = columnsToKeep.get(newCol);
        Vector column = dataSet.getColumn(sourceCol);
        for (Vector.Element cell : column.nonZeroes()) {
            trimmed.setFeatureValue(cell.index(), newCol, cell.get());
        }
    }

    // copy labels
    int[] sourceLabels = dataSet.getLabels();
    for (int row = 0; row < trimmed.getNumDataPoints(); row++) {
        trimmed.setLabel(row, sourceLabels[row]);
    }
    trimmed.setLabelTranslator(dataSet.getLabelTranslator());
    trimmed.setIdTranslator(dataSet.getIdTranslator());

    // pick the matching feature descriptors and renumber them
    List<Feature> sourceFeatures = dataSet.getFeatureList().getAll();
    List<Feature> keptFeatures = columnsToKeep.stream()
            .map(col -> sourceFeatures.get(col))
            .collect(Collectors.toList());
    int position = 0;
    for (Feature feature : keptFeatures) {
        feature.setIndex(position++);
    }
    trimmed.setFeatureList(new FeatureList(keptFeatures));
    return trimmed;
}
Also used : FeatureList(edu.neu.ccs.pyramid.feature.FeatureList) Vector(org.apache.mahout.math.Vector) Feature(edu.neu.ccs.pyramid.feature.Feature)

Aggregations

FeatureList (edu.neu.ccs.pyramid.feature.FeatureList)14 Feature (edu.neu.ccs.pyramid.feature.Feature)13 Vector (org.apache.mahout.math.Vector)8 Collectors (java.util.stream.Collectors)2 ClassProbability (edu.neu.ccs.pyramid.classification.ClassProbability)1 PredictionAnalysis (edu.neu.ccs.pyramid.classification.PredictionAnalysis)1 LogisticRegression (edu.neu.ccs.pyramid.classification.logistic_regression.LogisticRegression)1 edu.neu.ccs.pyramid.dataset (edu.neu.ccs.pyramid.dataset)1 FeatureUtility (edu.neu.ccs.pyramid.feature.FeatureUtility)1 TopFeatures (edu.neu.ccs.pyramid.feature.TopFeatures)1 MultiLabelPredictionAnalysis (edu.neu.ccs.pyramid.multilabel_classification.MultiLabelPredictionAnalysis)1 IMLGradientBoosting (edu.neu.ccs.pyramid.multilabel_classification.imlgb.IMLGradientBoosting)1 ClassScoreCalculation (edu.neu.ccs.pyramid.regression.ClassScoreCalculation)1 ConstantRule (edu.neu.ccs.pyramid.regression.ConstantRule)1 LinearRule (edu.neu.ccs.pyramid.regression.LinearRule)1 Rule (edu.neu.ccs.pyramid.regression.Rule)1 Pair (edu.neu.ccs.pyramid.util.Pair)1 java.util (java.util)1 ArrayList (java.util.ArrayList)1 Comparator (java.util.Comparator)1