use of edu.neu.ccs.pyramid.feature.Feature in project pyramid by cheng-li.
the class DataSetUtil method sampleFeatures.
/**
 * Builds a new data set that contains only the requested feature columns.
 * <p>
 * The result is created with the same number of data points, number of classes,
 * missing-value flag and dense/sparse representation as {@code dataSet}. Column {@code j}
 * of the result is filled from column {@code columnsToKeep.get(j)} of the input. Labels are
 * copied, and the label and id translators of the input are attached to the result.
 * <p>
 * NOTE(review): the {@link Feature} objects placed in the new feature list appear to be the
 * same instances held by {@code dataSet} (unless {@code getAll()} hands out copies), so the
 * {@code setIndex} calls below would also change the indices seen through the input data
 * set. Confirm that callers expect this.
 *
 * @param dataSet       the data set to take columns from
 * @param columnsToKeep indices of the columns to keep, in the order they should appear
 * @return the trimmed data set
 */
public static ClfDataSet sampleFeatures(ClfDataSet dataSet, List<Integer> columnsToKeep) {
    int numClasses = dataSet.getNumClasses();
    boolean missingValue = dataSet.hasMissingValue();

    // the result uses the same dense/sparse representation as the input
    ClfDataSet trimmed = dataSet.isDense()
            ? new DenseClfDataSet(dataSet.getNumDataPoints(), columnsToKeep.size(), missingValue, numClasses)
            : new SparseClfDataSet(dataSet.getNumDataPoints(), columnsToKeep.size(), missingValue, numClasses);

    // copy the non-zero entries of every selected column into its new position
    int numNewColumns = trimmed.getNumFeatures();
    for (int newColumn = 0; newColumn < numNewColumns; newColumn++) {
        Vector source = dataSet.getColumn(columnsToKeep.get(newColumn));
        for (Vector.Element entry : source.nonZeroes()) {
            trimmed.setFeatureValue(entry.index(), newColumn, entry.get());
        }
    }

    // copy labels
    int[] labels = dataSet.getLabels();
    int numRows = trimmed.getNumDataPoints();
    for (int row = 0; row < numRows; row++) {
        trimmed.setLabel(row, labels[row]);
    }

    trimmed.setLabelTranslator(dataSet.getLabelTranslator());
    trimmed.setIdTranslator(dataSet.getIdTranslator());

    // pick the selected features first, then renumber them by their new column position
    List<Feature> allFeatures = dataSet.getFeatureList().getAll();
    List<Feature> keptFeatures = columnsToKeep.stream()
            .map(column -> allFeatures.get(column))
            .collect(Collectors.toList());
    int position = 0;
    for (Feature feature : keptFeatures) {
        feature.setIndex(position);
        position++;
    }
    trimmed.setFeatureList(new FeatureList(keptFeatures));
    return trimmed;
}
use of edu.neu.ccs.pyramid.feature.Feature in project pyramid by cheng-li.
the class DataSetUtil method concatenateByColumn.
/**
 * Concatenates two multi-label data sets column-wise.
 * <p>
 * The result has the columns of {@code dataSet1} followed by the columns of
 * {@code dataSet2}. The number of classes, the density setting, the multi-labels, the label
 * translator and the id translator are all taken from {@code dataSet1}. The result is
 * flagged as having missing values if either input is.
 * <p>
 * NOTE(review): the features of both inputs are added to the new feature list as-is; no
 * explicit re-indexing of {@code dataSet2}'s features happens in this method. Whether
 * {@code FeatureList.add} assigns the new position is not visible here — confirm.
 *
 * @param dataSet1 the data set providing the leading columns, labels and translators
 * @param dataSet2 the data set providing the trailing columns
 * @return a new data set holding the columns of both inputs
 * @throws IllegalArgumentException if the inputs do not have the same number of data points
 */
public static MultiLabelClfDataSet concatenateByColumn(MultiLabelClfDataSet dataSet1, MultiLabelClfDataSet dataSet2) {
    int numDataPoints = dataSet1.getNumDataPoints();
    // Rows are matched by position, so the two inputs must describe the same data points.
    if (dataSet2.getNumDataPoints() != numDataPoints) {
        throw new IllegalArgumentException("cannot concatenate by column: dataSet1 has "
                + numDataPoints + " data points but dataSet2 has " + dataSet2.getNumDataPoints());
    }
    int numFeatures1 = dataSet1.getNumFeatures();
    int numFeatures2 = dataSet2.getNumFeatures();
    int numFeatures = numFeatures1 + numFeatures2;
    // Values from both inputs are copied in, so missing values in either one must be allowed.
    boolean missingValue = dataSet1.hasMissingValue() || dataSet2.hasMissingValue();
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder()
            .numDataPoints(numDataPoints)
            .numFeatures(numFeatures)
            .numClasses(dataSet1.getNumClasses())
            .density(dataSet1.density())
            .missingValue(missingValue)
            .build();

    // columns of dataSet1 keep their positions
    for (int j = 0; j < numFeatures1; j++) {
        Vector vector = dataSet1.getColumn(j);
        for (Vector.Element element : vector.nonZeroes()) {
            dataSet.setFeatureValue(element.index(), j, element.get());
        }
    }
    // columns of dataSet2 are shifted past the columns of dataSet1
    for (int j = 0; j < numFeatures2; j++) {
        Vector vector = dataSet2.getColumn(j);
        for (Vector.Element element : vector.nonZeroes()) {
            dataSet.setFeatureValue(element.index(), numFeatures1 + j, element.get());
        }
    }

    MultiLabel[] labels = dataSet1.getMultiLabels();
    for (int i = 0; i < numDataPoints; i++) {
        dataSet.setLabels(i, labels[i]);
    }

    FeatureList featureList = new FeatureList();
    for (Feature feature : dataSet1.getFeatureList().getAll()) {
        featureList.add(feature);
    }
    for (Feature feature : dataSet2.getFeatureList().getAll()) {
        featureList.add(feature);
    }
    dataSet.setFeatureList(featureList);
    dataSet.setLabelTranslator(dataSet1.getLabelTranslator());
    dataSet.setIdTranslator(dataSet1.getIdTranslator());
    return dataSet;
}
use of edu.neu.ccs.pyramid.feature.Feature in project pyramid by cheng-li.
the class DataSetUtil method sampleFeatures.
/**
 * Builds a new multi-label data set that contains only the requested feature columns.
 * <p>
 * The result is created with the same number of data points, number of classes,
 * missing-value flag and dense/sparse representation as {@code dataSet}. Column {@code j}
 * of the result is filled from column {@code columnsToKeep.get(j)} of the input. The matched
 * labels of every data point are added to the result, and the label and id translators of
 * the input are attached to it.
 * <p>
 * NOTE(review): the {@link Feature} objects placed in the new feature list appear to be the
 * same instances held by {@code dataSet} (unless {@code getAll()} hands out copies), so the
 * {@code setIndex} calls below would also change the indices seen through the input data
 * set. Confirm that callers expect this.
 *
 * @param dataSet       the data set to take columns from
 * @param columnsToKeep indices of the columns to keep, in the order they should appear
 * @return the trimmed data set
 */
public static MultiLabelClfDataSet sampleFeatures(MultiLabelClfDataSet dataSet, List<Integer> columnsToKeep) {
    boolean missingValue = dataSet.hasMissingValue();
    int numClasses = dataSet.getNumClasses();

    // the result uses the same dense/sparse representation as the input
    MultiLabelClfDataSet trimmed = dataSet.isDense()
            ? new DenseMLClfDataSet(dataSet.getNumDataPoints(), columnsToKeep.size(), missingValue, numClasses)
            : new SparseMLClfDataSet(dataSet.getNumDataPoints(), columnsToKeep.size(), missingValue, numClasses);

    // copy the non-zero entries of every selected column into its new position
    int keptCount = trimmed.getNumFeatures();
    for (int target = 0; target < keptCount; target++) {
        int sourceColumn = columnsToKeep.get(target);
        for (Vector.Element cell : dataSet.getColumn(sourceColumn).nonZeroes()) {
            trimmed.setFeatureValue(cell.index(), target, cell.get());
        }
    }

    // copy labels
    MultiLabel[] multiLabels = dataSet.getMultiLabels();
    int numRows = trimmed.getNumDataPoints();
    for (int row = 0; row < numRows; row++) {
        trimmed.addLabels(row, multiLabels[row].getMatchedLabels());
    }

    // the translators are reused, not copied
    trimmed.setLabelTranslator(dataSet.getLabelTranslator());
    trimmed.setIdTranslator(dataSet.getIdTranslator());

    // pick the selected features first, then renumber them by their new column position
    List<Feature> allFeatures = dataSet.getFeatureList().getAll();
    List<Feature> keptFeatures = columnsToKeep.stream()
            .map(column -> allFeatures.get(column))
            .collect(Collectors.toList());
    int position = 0;
    for (Feature feature : keptFeatures) {
        feature.setIndex(position);
        position++;
    }
    trimmed.setFeatureList(new FeatureList(keptFeatures));
    return trimmed;
}
use of edu.neu.ccs.pyramid.feature.Feature in project pyramid by cheng-li.
the class HMLGBInspector method topFeatures.
//todo: consider newton step and learning rate
/**
 * Ranks features by their accumulated importance over the regression trees of one class.
 * <p>
 * Only regressors that are {@link RegressionTree}s are considered. For each of them the
 * per-feature values returned by {@code RegTreeInspector.featureImportance} are summed, and
 * at most {@code limit} features with the largest totals are reported, largest first.
 *
 * @param boosting   the boosting model to inspect
 * @param classIndex index of the class whose regressors are inspected
 * @param limit      maximum number of features to report
 * @return the top features, together with the class index and the class name obtained from
 *         the model's label translator
 */
public static TopFeatures topFeatures(HMLGradientBoosting boosting, int classIndex, int limit) {
    Map<Feature, Double> totals = new HashMap<>();
    for (Regressor regressor : boosting.getRegressors(classIndex)) {
        if (!(regressor instanceof RegressionTree)) {
            continue;
        }
        Map<Feature, Double> perTree = RegTreeInspector.featureImportance((RegressionTree) regressor);
        perTree.forEach((feature, importance) ->
                totals.put(feature, totals.getOrDefault(feature, 0.0) + importance));
    }

    // largest accumulated importance first
    List<Feature> ranked = totals.entrySet().stream()
            .sorted(Map.Entry.<Feature, Double>comparingByValue().reversed())
            .limit(limit)
            .map(Map.Entry::getKey)
            .collect(Collectors.toList());

    TopFeatures result = new TopFeatures();
    result.setTopFeatures(ranked);
    result.setClassIndex(classIndex);
    String className = boosting.getLabelTranslator().toExtLabel(classIndex);
    result.setClassName(className);
    return result;
}
use of edu.neu.ccs.pyramid.feature.Feature in project pyramid by cheng-li.
the class MLLogisticRegressionInspector method topFeatures.
/**
 * Reports the features with the largest positive weights for one class.
 * <p>
 * Every position of the class's bias-free weight vector is paired with the feature at the
 * same position of the model's feature list, using the weight as the feature's utility.
 * Entries whose utility is not strictly positive are dropped, and at most {@code limit} of
 * the remaining features are reported, largest utility first.
 *
 * @param logisticRegression the model to inspect
 * @param classIndex         index of the class whose weights are inspected
 * @param limit              maximum number of features to report
 * @return the top features, together with the class index and the class name obtained from
 *         the model's label translator
 */
public static TopFeatures topFeatures(MLLogisticRegression logisticRegression, int classIndex, int limit) {
    FeatureList featureList = logisticRegression.getFeatureList();
    Vector weights = logisticRegression.getWeights().getWeightsWithoutBiasForClass(classIndex);

    Comparator<FeatureUtility> ascending = Comparator.comparing(FeatureUtility::getUtility);
    Comparator<FeatureUtility> descending = ascending.reversed();

    List<Feature> ranked = IntStream.range(0, weights.size())
            .mapToObj(position -> new FeatureUtility(featureList.get(position)).setUtility(weights.get(position)))
            .filter(scored -> scored.getUtility() > 0)
            .sorted(descending)
            .map(FeatureUtility::getFeature)
            .limit(limit)
            .collect(Collectors.toList());

    TopFeatures result = new TopFeatures();
    result.setTopFeatures(ranked);
    result.setClassIndex(classIndex);
    String className = logisticRegression.getLabelTranslator().toExtLabel(classIndex);
    result.setClassName(className);
    return result;
}
Aggregations