use of edu.neu.ccs.pyramid.feature.FeatureList in project pyramid by cheng-li.
the class MekaFormat method loadMLClfDatasetPreDense.
/**
 * Loads a multi-label data set from a file whose rows after the {@code @data} marker are
 * comma-separated values, one data point per line.
 *
 * <p>Every line before the first line starting with {@code @data} is ignored. Each later line
 * is treated as one data point: surrounding braces, if both present, are stripped, the line is
 * split on {@code ","}, and the first token is ignored. The remaining tokens are numbered from
 * 0. A token whose number is a key of {@code labelMap} is a label column (the label
 * {@code number - numFeatures} is added when the value equals 1.0); a token whose number is a
 * key of {@code featureMap} is stored as a feature value.
 *
 * @param file        the file to read
 * @param numClasses  number of classes passed to the data set builder
 * @param numFeatures number of features; also the offset subtracted from label column numbers
 * @param numData     number of data points passed to the data set builder
 * @param labelMap    column number (as a decimal string) to label name
 * @param featureMap  column number (as a decimal string) to feature name
 * @return the populated data set
 * @throws IOException           if reading the file fails
 * @throws RuntimeException      if a column number is in neither {@code labelMap} nor
 *                               {@code featureMap}
 * @throws NumberFormatException if a value cannot be parsed as a double
 */
private static MultiLabelClfDataSet loadMLClfDatasetPreDense(File file, int numClasses, int numFeatures, int numData, Map<String, String> labelMap, Map<String, String> featureMap) throws IOException {
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numDataPoints(numData).numClasses(numClasses).numFeatures(numFeatures).build();
    // set features
    List<Feature> featureList = new LinkedList<>();
    for (int m = 0; m < numFeatures; m++) {
        Feature feature = new Feature();
        feature.setIndex(m);
        feature.setName(featureMap.get(Integer.toString(m)));
        featureList.add(feature);
    }
    dataSet.setFeatureList(new FeatureList(featureList));
    // set Label
    Map<Integer, String> labelIndexMap = new HashMap<>();
    for (Map.Entry<String, String> entry : labelMap.entrySet()) {
        // label columns come after the feature columns, so shift them down to start at 0
        labelIndexMap.put(Integer.parseInt(entry.getKey()) - numFeatures, entry.getValue());
    }
    dataSet.setLabelTranslator(new LabelTranslator(labelIndexMap));
    // create feature matrix
    // try-with-resources: the reader is closed even when a malformed row makes parsing throw
    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
        String line;
        int dataCount = 0;
        boolean ifData = false;
        while ((line = br.readLine()) != null) {
            if (line.startsWith("@data")) {
                ifData = true;
                continue;
            }
            if (!ifData) {
                continue;
            }
            if (line.startsWith("{") && line.endsWith("}")) {
                line = line.substring(1, line.length() - 1);
            }
            String[] indexValues = line.split(",");
            // starts at -2 so that the first token lands on -1 and is skipped;
            // the second token is column 0
            int column = -2;
            for (String value : indexValues) {
                column++;
                if (column == -1) {
                    continue;
                }
                String index = Integer.toString(column);
                if (labelMap.containsKey(index)) {
                    if (Double.parseDouble(value) == 1.0) {
                        dataSet.addLabel(dataCount, column - numFeatures);
                    }
                } else if (featureMap.containsKey(index)) {
                    dataSet.setFeatureValue(dataCount, column, Double.parseDouble(value));
                } else {
                    throw new RuntimeException("Index not found in the line: " + line);
                }
            }
            dataCount++;
        }
    }
    return dataSet;
}
use of edu.neu.ccs.pyramid.feature.FeatureList in project pyramid by cheng-li.
the class DataSetUtil method concatenateByColumn.
/**
 * Builds a new data set whose columns are the columns of {@code dataSet1} followed by the
 * columns of {@code dataSet2}. The two inputs are assumed to have different feature sets.
 *
 * <p>The number of data points, number of classes, density, missing-value flag, labels,
 * label translator and id translator are all taken from {@code dataSet1}.
 * NOTE(review): {@code dataSet2} presumably has the same data points in the same order as
 * {@code dataSet1}; this method does not check that.
 *
 * @param dataSet1 supplies the leading columns and all per-data-point metadata
 * @param dataSet2 supplies the trailing columns
 * @return the column-wise concatenation
 */
public static ClfDataSet concatenateByColumn(ClfDataSet dataSet1, ClfDataSet dataSet2) {
    int rowCount = dataSet1.getNumDataPoints();
    int leftWidth = dataSet1.getNumFeatures();
    int rightWidth = dataSet2.getNumFeatures();
    ClfDataSet combined = ClfDataSetBuilder.getBuilder()
            .numDataPoints(rowCount)
            .numFeatures(leftWidth + rightWidth)
            .numClasses(dataSet1.getNumClasses())
            .dense(dataSet1.isDense())
            .missingValue(dataSet1.hasMissingValue())
            .build();

    // columns of dataSet1 keep their positions
    for (int col = 0; col < leftWidth; col++) {
        for (Vector.Element entry : dataSet1.getColumn(col).nonZeroes()) {
            combined.setFeatureValue(entry.index(), col, entry.get());
        }
    }
    // columns of dataSet2 are shifted right by the width of dataSet1
    for (int col = 0; col < rightWidth; col++) {
        int target = leftWidth + col;
        for (Vector.Element entry : dataSet2.getColumn(col).nonZeroes()) {
            combined.setFeatureValue(entry.index(), target, entry.get());
        }
    }

    int[] sourceLabels = dataSet1.getLabels();
    for (int row = 0; row < rowCount; row++) {
        combined.setLabel(row, sourceLabels[row]);
    }

    // feature descriptors follow the same order as the columns: dataSet1 first, then dataSet2
    FeatureList allFeatures = new FeatureList();
    for (Feature left : dataSet1.getFeatureList().getAll()) {
        allFeatures.add(left);
    }
    for (Feature right : dataSet2.getFeatureList().getAll()) {
        allFeatures.add(right);
    }
    combined.setFeatureList(allFeatures);
    combined.setLabelTranslator(dataSet1.getLabelTranslator());
    combined.setIdTranslator(dataSet1.getIdTranslator());
    return combined;
}
use of edu.neu.ccs.pyramid.feature.FeatureList in project pyramid by cheng-li.
the class DataSetUtil method sampleFeatures.
/**
 * Creates a regression data set that contains only the selected columns of {@code dataSet},
 * in the order given by {@code columnsToKeep}.
 *
 * <p>Labels and the id translator are carried over; the density and missing-value flag of
 * the source are kept.
 * NOTE(review): the kept {@code Feature} objects come from the source data set's feature
 * list and have {@code setIndex} called on them here; if that list exposes the source's own
 * objects, the source's feature indices change too. Confirm against {@code FeatureList.getAll}.
 *
 * @param dataSet       the source data set
 * @param columnsToKeep indices of the source columns to keep; position in this list becomes
 *                      the new column index
 * @return the trimmed data set
 */
public static RegDataSet sampleFeatures(RegDataSet dataSet, List<Integer> columnsToKeep) {
    RegDataSet trimmed = RegDataSetBuilder.getBuilder()
            .numDataPoints(dataSet.getNumDataPoints())
            .numFeatures(columnsToKeep.size())
            .missingValue(dataSet.hasMissingValue())
            .dense(dataSet.isDense())
            .build();

    // copy the non-zero entries of every kept column into its new position
    for (int newColumn = 0; newColumn < trimmed.getNumFeatures(); newColumn++) {
        int sourceColumn = columnsToKeep.get(newColumn);
        for (Vector.Element entry : dataSet.getColumn(sourceColumn).nonZeroes()) {
            trimmed.setFeatureValue(entry.index(), newColumn, entry.get());
        }
    }

    // copy labels
    double[] sourceLabels = dataSet.getLabels();
    for (int row = 0; row < trimmed.getNumDataPoints(); row++) {
        trimmed.setLabel(row, sourceLabels[row]);
    }
    trimmed.setIdTranslator(dataSet.getIdTranslator());

    // pick the kept feature descriptors, then renumber them to match the new columns
    List<Feature> sourceFeatures = dataSet.getFeatureList().getAll();
    List<Feature> keptFeatures = columnsToKeep.stream()
            .map(sourceFeatures::get)
            .collect(Collectors.toList());
    int position = 0;
    for (Feature kept : keptFeatures) {
        kept.setIndex(position);
        position++;
    }
    trimmed.setFeatureList(new FeatureList(keptFeatures));
    return trimmed;
}
use of edu.neu.ccs.pyramid.feature.FeatureList in project pyramid by cheng-li.
the class RegTreeInspector method featureImportance.
/**
 * Sums, per feature, the reduction recorded on the non-leaf nodes of a tree.
 *
 * @param tree the regression tree to inspect
 * @return a map from each feature referenced by at least one non-leaf node to the sum of
 *         {@code getReduction()} over the non-leaf nodes that reference it; features that no
 *         non-leaf node references are absent from the map
 */
public static Map<Feature, Double> featureImportance(RegressionTree tree) {
    FeatureList featureList = tree.getFeatureList();
    Map<Feature, Double> map = new HashMap<>();
    for (Node node : tree.traverse()) {
        if (node.isLeaf()) {
            // leaves have no split, so they contribute nothing
            continue;
        }
        Feature feature = featureList.get(node.getFeatureIndex());
        // accumulate: first occurrence stores the reduction, later ones add to it
        map.merge(feature, node.getReduction(), Double::sum);
    }
    return map;
}
use of edu.neu.ccs.pyramid.feature.FeatureList in project pyramid by cheng-li.
the class DataSetUtil method sampleFeatures.
/**
 * Creates a classification data set that contains only the selected columns of
 * {@code dataSet}, in the order given by {@code columnsToKeep}.
 *
 * <p>The result is dense when the source is dense and sparse otherwise. Labels, the label
 * translator and the id translator are carried over.
 * NOTE(review): the kept {@code Feature} objects come from the source data set's feature
 * list and have {@code setIndex} called on them here; if that list exposes the source's own
 * objects, the source's feature indices change too. Confirm against {@code FeatureList.getAll}.
 *
 * @param dataSet       the source data set
 * @param columnsToKeep indices of the source columns to keep; position in this list becomes
 *                      the new column index
 * @return the trimmed data set
 */
public static ClfDataSet sampleFeatures(ClfDataSet dataSet, List<Integer> columnsToKeep) {
    int numClasses = dataSet.getNumClasses();
    boolean missingValue = dataSet.hasMissingValue();
    // keep density
    ClfDataSet trimmed = dataSet.isDense()
            ? new DenseClfDataSet(dataSet.getNumDataPoints(), columnsToKeep.size(), missingValue, numClasses)
            : new SparseClfDataSet(dataSet.getNumDataPoints(), columnsToKeep.size(), missingValue, numClasses);

    // copy the non-zero entries of every kept column into its new position
    for (int target = 0; target < trimmed.getNumFeatures(); target++) {
        int source = columnsToKeep.get(target);
        Vector column = dataSet.getColumn(source);
        for (Vector.Element cell : column.nonZeroes()) {
            trimmed.setFeatureValue(cell.index(), target, cell.get());
        }
    }

    // copy labels
    int[] originalLabels = dataSet.getLabels();
    for (int row = 0; row < trimmed.getNumDataPoints(); row++) {
        trimmed.setLabel(row, originalLabels[row]);
    }
    trimmed.setLabelTranslator(dataSet.getLabelTranslator());
    trimmed.setIdTranslator(dataSet.getIdTranslator());

    // pick the kept feature descriptors, then renumber them to match the new columns
    List<Feature> allFeatures = dataSet.getFeatureList().getAll();
    List<Feature> selected = columnsToKeep.stream()
            .map(allFeatures::get)
            .collect(Collectors.toList());
    int nextIndex = 0;
    for (Feature feature : selected) {
        feature.setIndex(nextIndex);
        nextIndex++;
    }
    trimmed.setFeatureList(new FeatureList(selected));
    return trimmed;
}
Aggregations