use of smile.data.Datum in project smile by haifengl.
the class GCTParser method parse.
/**
 * Parse a GCT dataset from an input stream.
 * <p>
 * Expected layout, one tab-separated record per line:
 * <ol>
 * <li>the version line, which must be exactly {@code #1.2};</li>
 * <li>the number of data rows and the number of data columns;</li>
 * <li>the title header: two leading columns followed by one title per data column;</li>
 * <li>the data rows: name, description, then one numeric value per data column.</li>
 * </ol>
 * Empty value cells are stored as {@code Double.NaN}.
 * <p>
 * The stream is wrapped in a reader that is closed before this method
 * returns, whether it completes normally or throws, which also closes
 * {@code stream}.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data.
 * @return the parsed dataset with one numeric attribute per data column.
 * @throws java.io.IOException if the stream cannot be read, ends early,
 *         or does not follow the layout described above.
 * @throws ParseException declared by the signature; this method body does
 *         not throw it directly.
 * @throws NumberFormatException if a size or a non-empty value cell is not
 *         a valid number.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources guarantees the reader is released on every exit
    // path, including the validation failures below.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }
        if (!line.equals("#1.2")) {
            throw new IOException("Invalid version.");
        }

        // Line 2: <rows> TAB <columns>.
        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }
        String[] tokens = line.split("\t", -1);
        if (tokens.length != 2) {
            throw new IOException("Invalid data size inforamation.");
        }
        int n = Integer.parseInt(tokens[0]);
        int p = Integer.parseInt(tokens[1]);
        if (n <= 0 || p <= 0) {
            throw new IOException(String.format("Invalid data size %d x %d.", n, p));
        }

        // Line 3: title header; the first two columns are not attributes.
        Attribute[] attributes = new Attribute[p];
        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }
        tokens = line.split("\t", -1);
        if (tokens.length != p + 2) {
            throw new IOException("Invalid title header.");
        }
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + 2]);
        }

        AttributeDataset data = new AttributeDataset(name, attributes);
        for (int i = 0; i < n; i++) {
            line = reader.readLine();
            if (line == null) {
                throw new IOException("Premature end of file.");
            }
            tokens = line.split("\t", -1);
            if (tokens.length != p + 2) {
                // Three header lines precede the data, so row i is file line i + 4 (1-based).
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i + 4, tokens.length));
            }
            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                if (tokens[j + 2].isEmpty()) {
                    x[j] = Double.NaN;
                } else {
                    x[j] = Double.valueOf(tokens[j + 2]);
                }
            }
            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            datum.description = tokens[1];
            data.add(datum);
        }
        return data;
    }
}
use of smile.data.Datum in project smile by haifengl.
the class PCLParser method parse.
/**
 * Parse a PCL dataset from an input stream.
 * <p>
 * Expected layout, one tab-separated record per line:
 * <ol>
 * <li>the title header: three leading columns followed by one title per data column;</li>
 * <li>the sample weight line, with the same number of columns as the header;</li>
 * <li>the data rows: name, description, row weight, then one numeric value per data column.</li>
 * </ol>
 * Empty value cells are stored as {@code Double.NaN}.
 * <p>
 * The stream is wrapped in a reader that is closed before this method
 * returns, whether it completes normally or throws, which also closes
 * {@code stream}.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data.
 * @return the parsed dataset with one numeric attribute per data column.
 * @throws java.io.IOException if the stream cannot be read, ends early,
 *         or does not follow the layout described above.
 * @throws ParseException declared by the signature; this method body does
 *         not throw it directly.
 * @throws NumberFormatException if a weight or a non-empty value cell is
 *         not a valid number.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources guarantees the reader is released on every exit
    // path, including the validation failures below.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }

        String[] tokens = line.split("\t", -1);
        // The first three columns are not attributes; without them the
        // attribute count below would be negative.
        if (tokens.length < 3) {
            throw new IOException(String.format("Invalid title header: expected at least 3 columns but found %d.", tokens.length));
        }
        int p = tokens.length - 3;

        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }

        String[] weight = line.split("\t", -1);
        if (weight.length != tokens.length) {
            throw new IOException("Invalid sample weight header.");
        }

        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + 3], null, Double.valueOf(weight[i + 3]));
        }

        AttributeDataset data = new AttributeDataset(name, attributes);

        // i tracks the 1-based file line number; data starts on line 3.
        for (int i = 3; (line = reader.readLine()) != null; i++) {
            tokens = line.split("\t", -1);
            if (tokens.length != weight.length) {
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i, tokens.length));
            }

            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                if (tokens[j + 3].isEmpty()) {
                    x[j] = Double.NaN;
                } else {
                    x[j] = Double.valueOf(tokens[j + 3]);
                }
            }

            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            datum.description = tokens[1];
            datum.weight = Double.valueOf(tokens[2]);
            data.add(datum);
        }
        return data;
    }
}
use of smile.data.Datum in project smile by haifengl.
the class TXTParser method parse.
/**
 * Parse a TXT dataset from an input stream.
 * <p>
 * The first line is a tab-separated title header. Its first column heads
 * the row names. If its second column is {@code description}
 * (case-insensitive), that column is read as the row description;
 * otherwise data columns start at the second column. Every following line
 * must have as many columns as the header. Empty value cells are stored as
 * {@code Double.NaN}.
 * <p>
 * The stream is wrapped in a reader that is closed before this method
 * returns, whether it completes normally or throws, which also closes
 * {@code stream}.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data.
 * @return the parsed dataset with one numeric attribute per data column.
 * @throws java.io.IOException if the stream cannot be read, is empty, or
 *         a data line has an unexpected number of columns.
 * @throws ParseException declared by the signature; this method body does
 *         not throw it directly.
 * @throws NumberFormatException if a non-empty value cell is not a valid
 *         number.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources guarantees the reader is released on every exit
    // path, including the validation failures below.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }

        String[] tokens = line.split("\t", -1);
        int start = 1;
        int p = tokens.length - 1;
        // Guard the lookup: a single-column header has no second column to inspect.
        if (tokens.length > 1 && tokens[1].equalsIgnoreCase("description")) {
            start = 2;
            p = tokens.length - 2;
        }

        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + start]);
        }

        AttributeDataset data = new AttributeDataset(name, attributes);

        // i tracks the 1-based file line number; data starts on line 2.
        for (int i = 2; (line = reader.readLine()) != null; i++) {
            tokens = line.split("\t", -1);
            if (tokens.length != p + start) {
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i, tokens.length));
            }

            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                if (tokens[j + start].isEmpty()) {
                    x[j] = Double.NaN;
                } else {
                    x[j] = Double.valueOf(tokens[j + start]);
                }
            }

            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            if (start == 2) {
                datum.description = tokens[1];
            }
            data.add(datum);
        }
        return data;
    }
}
use of smile.data.Datum in project smile by haifengl.
the class DelimitedTextParser method parse.
/**
 * Reads a delimited-text dataset from a buffered reader.
 * <p>
 * Blank lines and lines starting with the configured comment prefix are
 * skipped. The first remaining line fixes the expected column count. It is
 * consumed as the column-name header when column names are enabled, and as
 * the first data row otherwise. Cells equal (ignoring case) to the
 * configured missing-value marker become {@code Double.NaN}.
 *
 * @param name the name of dataset.
 * @param attributes the attributes of the data columns in file order, or
 *        {@code null} to create numeric attributes named {@code V1},
 *        {@code V2}, ... from the width of the first line.
 * @param reader the buffered reader for data.
 * @return the dataset built from the remaining lines of the reader.
 * @throws java.io.IOException if reading fails or no data line is found.
 * @throws ParseException if the response index is out of range or a line
 *         has an unexpected number of columns.
 */
private AttributeDataset parse(String name, Attribute[] attributes, BufferedReader reader) throws IOException, ParseException {
    // Advance to the first line that is neither blank nor a comment.
    String line = reader.readLine();
    while (line != null && (line.isEmpty() || line.startsWith(comment))) {
        line = reader.readLine();
    }

    if (line == null) {
        throw new IOException("Empty data source.");
    }

    String[] s = line.split(delimiter, 0);

    // No schema supplied: derive one numeric attribute per data column.
    if (attributes == null) {
        if (responseIndex >= s.length) {
            throw new ParseException("Invalid response variable index: " + responseIndex, responseIndex);
        }

        int p = s.length - (hasRowNames ? 1 : 0) - (responseIndex >= 0 ? 1 : 0);
        attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute("V" + (i + 1));
        }
    }

    // Expected width = attributes + optional row-name column + optional response column.
    int startColumn = hasRowNames ? 1 : 0;
    int ncols = attributes.length + startColumn + (responseIndex >= 0 ? 1 : 0);

    if (s.length != ncols) {
        throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
    }

    AttributeDataset data = new AttributeDataset(name, attributes, response);

    if (hasColumnNames) {
        // The first line is a header: copy its titles onto the attributes and the response.
        int k = 0;
        for (int i = startColumn; i < s.length; i++) {
            if (i == responseIndex) {
                response.setName(s[i]);
            } else {
                attributes[k].setName(s[i]);
                k++;
            }
        }
    } else {
        // The first line is already a data row.
        double[] features = new double[attributes.length];
        double target = Double.NaN;
        int k = 0;
        for (int i = startColumn; i < s.length; i++) {
            if (i == responseIndex) {
                target = response.valueOf(s[i]);
                continue;
            }

            boolean isMissing = missing != null && missing.equalsIgnoreCase(s[i]);
            features[k] = isMissing ? Double.NaN : attributes[k].valueOf(s[i]);
            k++;
        }

        Datum<double[]> first = new Datum<>(features, target);
        first.name = hasRowNames ? s[0] : null;
        data.add(first);
    }

    // Remaining lines are all data rows.
    while ((line = reader.readLine()) != null) {
        boolean skip = line.isEmpty() || line.startsWith(comment);
        if (!skip) {
            s = line.split(delimiter, 0);
            if (s.length != ncols) {
                throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
            }

            double[] features = new double[attributes.length];
            double target = Double.NaN;
            int k = 0;
            for (int i = startColumn; i < s.length; i++) {
                if (i == responseIndex) {
                    target = response.valueOf(s[i]);
                    continue;
                }

                boolean isMissing = missing != null && missing.equalsIgnoreCase(s[i]);
                features[k] = isMissing ? Double.NaN : attributes[k].valueOf(s[i]);
                k++;
            }

            Datum<double[]> row = new Datum<>(features, target);
            row.name = hasRowNames ? s[0] : null;
            data.add(row);
        }
    }

    return data;
}
use of smile.data.Datum in project smile by haifengl.
the class FeatureSet method f.
/**
 * Builds an attribute dataset whose rows are the generated features of the
 * rows of the input dataset.
 * <p>
 * The name, description and response of the input dataset are carried
 * over, as are the response value, weight, name, description and timestamp
 * of every row.
 *
 * @param data input dataset.
 * @return an attribute dataset with generated features
 */
public AttributeDataset f(Dataset<T> data) {
    AttributeDataset result = new AttributeDataset(data.getName(), attributes(), data.response());
    result.setDescription(data.getDescription());

    int index = 0;
    while (index < data.size()) {
        Datum<T> source = data.get(index);

        // Generate the feature vector first, then copy the row metadata.
        double[] features = f(source.x);
        Datum<double[]> target = new Datum<>(features, source.y, source.weight);
        target.name = source.name;
        target.description = source.description;
        target.timestamp = source.timestamp;

        result.add(target);
        index++;
    }

    return result;
}
Aggregations