Use of edu.neu.ccs.pyramid.feature.Feature in project pyramid by cheng-li.
Class TRECFormat, method writeFeatureList.
/**
 * Writes the feature list of {@code dataSet} into the directory {@code trecFile} in two forms:
 * a Java-serialized copy of the {@code FeatureList} object, and a text file holding the
 * {@code toString()} of every feature, one per line.
 *
 * <p>An {@link IOException} raised by either write is printed with {@code printStackTrace()}
 * and not rethrown; the text file is still attempted when the serialized write fails.
 *
 * @param dataSet  data set whose feature list is written
 * @param trecFile parent directory that receives both files
 */
private static void writeFeatureList(DataSet dataSet, File trecFile) {
    File serializedTarget = new File(trecFile, TREC_FEATURE_LIST_FILE_NAME);
    FeatureList features = dataSet.getFeatureList();

    // Serialized form: the whole FeatureList object through a buffered object stream.
    try (FileOutputStream rawOut = new FileOutputStream(serializedTarget);
         BufferedOutputStream bufferedOut = new BufferedOutputStream(rawOut);
         ObjectOutputStream objectOut = new ObjectOutputStream(bufferedOut)) {
        objectOut.writeObject(features);
    } catch (IOException serializationFailure) {
        serializationFailure.printStackTrace();
    }

    // Text form: one line per feature.
    File textTarget = new File(trecFile, TREC_FEATURE_LIST_TEXT_FILE_NAME);
    try (BufferedWriter textOut = new BufferedWriter(new FileWriter(textTarget))) {
        for (Feature current : features.getAll()) {
            textOut.write(current.toString());
            textOut.newLine();
        }
    } catch (IOException textFailure) {
        textFailure.printStackTrace();
    }
}
Use of edu.neu.ccs.pyramid.feature.Feature in project pyramid by cheng-li.
Class MekaFormat, method loadMLClfDatasetDense.
/**
 * Loads a multi-label data set from a file whose data rows are dense, comma-separated values.
 *
 * <p>Lines up to and including the one starting with {@code @data} are ignored. Every later
 * line is treated as one data point: surrounding braces are stripped if present, the line is
 * split on {@code ","}, and the value at position {@code i} is assigned to column {@code i}.
 * A column whose index is a key of {@code labelMap} adds label {@code i - numFeatures} when its
 * value equals 1.0; a column whose index is a key of {@code featureMap} sets feature {@code i}.
 *
 * @param file        file to read
 * @param numClasses  number of classes of the data set to build
 * @param numFeatures number of features; also the offset subtracted from label column indices
 * @param numData     number of data points of the data set to build
 * @param labelMap    column index (as decimal string) to label name
 * @param featureMap  column index (as decimal string) to feature name
 * @return the populated data set
 * @throws IOException      if the file cannot be read
 * @throws RuntimeException if a column index appears in neither map
 */
private static MultiLabelClfDataSet loadMLClfDatasetDense(File file, int numClasses, int numFeatures, int numData, Map<String, String> labelMap, Map<String, String> featureMap) throws IOException {
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numDataPoints(numData).numClasses(numClasses).numFeatures(numFeatures).build();
    // set features
    List<Feature> featureList = new LinkedList<>();
    for (int m = 0; m < numFeatures; m++) {
        Feature feature = new Feature();
        feature.setIndex(m);
        feature.setName(featureMap.get(Integer.toString(m)));
        featureList.add(feature);
    }
    dataSet.setFeatureList(new FeatureList(featureList));
    // set labels: label columns follow the feature columns, so shift them back to start at 0
    Map<Integer, String> labelIndexMap = new HashMap<>();
    for (Map.Entry<String, String> entry : labelMap.entrySet()) {
        labelIndexMap.put(Integer.parseInt(entry.getKey()) - numFeatures, entry.getValue());
    }
    dataSet.setLabelTranslator(new LabelTranslator(labelIndexMap));
    // create feature matrix; try-with-resources closes the reader even when parsing throws
    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
        String line;
        int dataCount = 0;
        boolean inData = false;
        while ((line = br.readLine()) != null) {
            if (line.startsWith("@data")) {
                inData = true;
                continue;
            }
            if (!inData) {
                continue;
            }
            // NOTE(review): a blank line after @data is parsed as a row with one empty value
            // (same as before this change) — confirm whether input files can contain them.
            if (line.startsWith("{") && line.endsWith("}")) {
                line = line.substring(1, line.length() - 1);
            }
            String[] values = line.split(",");
            for (int column = 0; column < values.length; column++) {
                String columnKey = Integer.toString(column);
                if (labelMap.containsKey(columnKey)) {
                    if (Double.parseDouble(values[column]) == 1.0) {
                        dataSet.addLabel(dataCount, column - numFeatures);
                    }
                } else if (featureMap.containsKey(columnKey)) {
                    dataSet.setFeatureValue(dataCount, column, Double.parseDouble(values[column]));
                } else {
                    throw new RuntimeException("Index not found in the line: " + line);
                }
            }
            dataCount++;
        }
    }
    return dataSet;
}
Use of edu.neu.ccs.pyramid.feature.Feature in project pyramid by cheng-li.
Class MekaFormat, method save.
/**
 * Writes {@code dataSet} to {@code mekaFile} in sparse Meka/ARFF text form.
 *
 * <p>Layout of the output:
 * <ul>
 *   <li>header {@code @relation 'dataName: -C numClasses'};</li>
 *   <li>one {@code @attribute f<i> numeric} line per feature;</li>
 *   <li>one {@code @attribute <label> {0,1}} line per class, with spaces in the label name
 *       replaced by underscores;</li>
 *   <li>after {@code @data}, one {@code {index value,...}} line per data point: non-zero
 *       features in ascending index order, followed by the matched labels written as
 *       {@code (labelIndex + numFeatures) 1}.</li>
 * </ul>
 *
 * @param dataSet  data set to write
 * @param mekaFile path of the output file
 * @param dataName relation name placed in the header
 * @throws IOException if the file cannot be written
 */
public static void save(MultiLabelClfDataSet dataSet, String mekaFile, String dataName) throws IOException {
    // try-with-resources flushes and closes the writer even when a write fails part-way
    try (BufferedWriter bw = new BufferedWriter(new FileWriter(mekaFile))) {
        // writing the header: @relation 'data_name: -C number_classes\n\n'
        LabelTranslator labelTranslator = dataSet.getLabelTranslator();
        int numClasses = labelTranslator.getNumClasses();
        bw.write("@relation " + "'" + dataName + ": -C " + numClasses + "'\n\n");
        // feature attributes are named by position only
        FeatureList featureList = dataSet.getFeatureList();
        for (int i = 0; i < featureList.size(); i++) {
            bw.write("@attribute " + "f" + i + " numeric\n");
        }
        // label attributes
        for (int i = 0; i < numClasses; i++) {
            String labelName = labelTranslator.toExtLabel(i);
            bw.write("@attribute " + labelName.replace(" ", "_") + " {0,1}\n");
        }
        // starting @data
        MultiLabel[] multiLabels = dataSet.getMultiLabels();
        int numFeatures = dataSet.getNumFeatures();
        bw.write("\n@data\n\n");
        for (int i = 0; i < dataSet.getNumDataPoints(); i++) {
            // TreeMap puts the non-zero entries in ascending index order
            Map<Integer, Double> sortedKeys = new TreeMap<>();
            for (Vector.Element element : dataSet.getRow(i).nonZeroes()) {
                sortedKeys.put(element.index(), element.get());
            }
            StringBuilder row = new StringBuilder("{");
            // The separator is emitted before every entry except the first one written, so a
            // row that has labels but no non-zero features no longer starts with a stray comma.
            String separator = "";
            for (Map.Entry<Integer, Double> entry : sortedKeys.entrySet()) {
                int index = entry.getKey();
                double value = entry.getValue();
                row.append(separator).append(index).append(' ').append(value);
                separator = ",";
            }
            // label columns come after the feature columns
            for (int matchedLabel : multiLabels[i].getMatchedLabelsOrdered()) {
                row.append(separator).append(matchedLabel + numFeatures).append(" 1");
                separator = ",";
            }
            row.append("}\n");
            bw.write(row.toString());
        }
    }
}
Use of edu.neu.ccs.pyramid.feature.Feature in project pyramid by cheng-li.
Class MekaFormat, method loadMLClfDataset.
/**
 * Loads a multi-label data set from a file whose data rows are sparse
 * {@code {index value, index value, ...}} lines.
 *
 * <p>Only lines that start with <code>{</code> and end with <code>}</code> are read; every such
 * line counts as one data point, including an empty <code>{}</code> row. Pairs may be separated
 * by {@code ","} or {@code ", "}. An index that is a key of {@code labelMap} adds label
 * {@code index - numFeatures} when its value equals 1.0; an index that is a key of
 * {@code featureMap} sets that feature's value.
 *
 * @param file        file to read
 * @param numClasses  number of classes of the data set to build
 * @param numFeatures number of features; also the offset subtracted from label indices
 * @param numData     number of data points of the data set to build
 * @param labelMap    attribute index (as decimal string) to label name
 * @param featureMap  attribute index (as decimal string) to feature name
 * @return the populated data set
 * @throws IOException      if the file cannot be read
 * @throws RuntimeException if a pair has no value or its index appears in neither map
 */
private static MultiLabelClfDataSet loadMLClfDataset(File file, int numClasses, int numFeatures, int numData, Map<String, String> labelMap, Map<String, String> featureMap) throws IOException {
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numDataPoints(numData).numClasses(numClasses).numFeatures(numFeatures).build();
    // set features
    List<Feature> featureList = new LinkedList<>();
    for (int m = 0; m < numFeatures; m++) {
        Feature feature = new Feature();
        feature.setIndex(m);
        feature.setName(featureMap.get(Integer.toString(m)));
        featureList.add(feature);
    }
    dataSet.setFeatureList(new FeatureList(featureList));
    // set labels: label indices follow the feature indices, so shift them back to start at 0
    Map<Integer, String> labelIndexMap = new HashMap<>();
    for (Map.Entry<String, String> entry : labelMap.entrySet()) {
        labelIndexMap.put(Integer.parseInt(entry.getKey()) - numFeatures, entry.getValue());
    }
    dataSet.setLabelTranslator(new LabelTranslator(labelIndexMap));
    // create feature matrix; try-with-resources closes the reader even when parsing throws
    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
        String line;
        int dataCount = 0;
        while ((line = br.readLine()) != null) {
            if (!(line.startsWith("{") && line.endsWith("}"))) {
                continue;
            }
            line = line.substring(1, line.length() - 1);
            // Split on the bare comma and trim each piece so that both "a 1,b 2" and
            // "a 1, b 2" are accepted.
            for (String indexValue : line.split(",")) {
                String pairText = indexValue.trim();
                if (pairText.isEmpty()) {
                    // empty row "{}" or a trailing separator: nothing to store
                    continue;
                }
                String[] indexValuePair = pairText.split("\\s+");
                if (indexValuePair.length < 2) {
                    throw new RuntimeException("Missing value for index:" + indexValuePair[0] + " in the line: " + line);
                }
                String index = indexValuePair[0];
                String value = indexValuePair[1];
                if (labelMap.containsKey(index)) {
                    if (Double.parseDouble(value) == 1.0) {
                        dataSet.addLabel(dataCount, Integer.parseInt(index) - numFeatures);
                    }
                } else if (featureMap.containsKey(index)) {
                    dataSet.setFeatureValue(dataCount, Integer.parseInt(index), Double.parseDouble(value));
                } else {
                    throw new RuntimeException("Index:" + index + " not found in the line: " + line);
                }
            }
            dataCount++;
        }
    }
    return dataSet;
}
Use of edu.neu.ccs.pyramid.feature.Feature in project pyramid by cheng-li.
Class MekaFormat, method loadMLClfDatasetPreDense.
/**
 * Loads a multi-label data set from a file whose data rows are dense, comma-separated values
 * preceded by one leading value that is skipped.
 *
 * <p>Lines up to and including the one starting with {@code @data} are ignored. Every later
 * line is treated as one data point: surrounding braces are stripped if present, the line is
 * split on {@code ","}, the first value is discarded, and the value at position {@code i + 1}
 * is assigned to column {@code i}. A column whose index is a key of {@code labelMap} adds label
 * {@code i - numFeatures} when its value equals 1.0; a column whose index is a key of
 * {@code featureMap} sets feature {@code i}.
 *
 * @param file        file to read
 * @param numClasses  number of classes of the data set to build
 * @param numFeatures number of features; also the offset subtracted from label column indices
 * @param numData     number of data points of the data set to build
 * @param labelMap    column index (as decimal string) to label name
 * @param featureMap  column index (as decimal string) to feature name
 * @return the populated data set
 * @throws IOException      if the file cannot be read
 * @throws RuntimeException if a column index appears in neither map
 */
private static MultiLabelClfDataSet loadMLClfDatasetPreDense(File file, int numClasses, int numFeatures, int numData, Map<String, String> labelMap, Map<String, String> featureMap) throws IOException {
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numDataPoints(numData).numClasses(numClasses).numFeatures(numFeatures).build();
    // set features
    List<Feature> featureList = new LinkedList<>();
    for (int m = 0; m < numFeatures; m++) {
        Feature feature = new Feature();
        feature.setIndex(m);
        feature.setName(featureMap.get(Integer.toString(m)));
        featureList.add(feature);
    }
    dataSet.setFeatureList(new FeatureList(featureList));
    // set labels: label columns follow the feature columns, so shift them back to start at 0
    Map<Integer, String> labelIndexMap = new HashMap<>();
    for (Map.Entry<String, String> entry : labelMap.entrySet()) {
        labelIndexMap.put(Integer.parseInt(entry.getKey()) - numFeatures, entry.getValue());
    }
    dataSet.setLabelTranslator(new LabelTranslator(labelIndexMap));
    // create feature matrix; try-with-resources closes the reader even when parsing throws
    try (BufferedReader br = new BufferedReader(new FileReader(file))) {
        String line;
        int dataCount = 0;
        boolean inData = false;
        while ((line = br.readLine()) != null) {
            if (line.startsWith("@data")) {
                inData = true;
                continue;
            }
            if (!inData) {
                continue;
            }
            // NOTE(review): a blank line after @data still advances the data-point counter
            // (same as before this change) — confirm whether input files can contain them.
            if (line.startsWith("{") && line.endsWith("}")) {
                line = line.substring(1, line.length() - 1);
            }
            String[] values = line.split(",");
            // position 0 is the skipped leading value; position p holds column p - 1
            for (int position = 1; position < values.length; position++) {
                int column = position - 1;
                String columnKey = Integer.toString(column);
                if (labelMap.containsKey(columnKey)) {
                    if (Double.parseDouble(values[position]) == 1.0) {
                        dataSet.addLabel(dataCount, column - numFeatures);
                    }
                } else if (featureMap.containsKey(columnKey)) {
                    dataSet.setFeatureValue(dataCount, column, Double.parseDouble(values[position]));
                } else {
                    throw new RuntimeException("Index not found in the line: " + line);
                }
            }
            dataCount++;
        }
    }
    return dataSet;
}
Aggregations