use of smile.data.NumericAttribute in project smile by haifengl.
the class GCTParser method parse.
/**
 * Parse a GCT dataset from an input stream.
 * <p>
 * The layout read by this method is:
 * <ol>
 * <li>a version line that must be exactly {@code #1.2};</li>
 * <li>a line with two tab-separated integers: the number of data rows
 *     {@code n} and the number of data columns {@code p}, both positive;</li>
 * <li>a header line with {@code p + 2} tab-separated fields, of which the
 *     third and later ones become the numeric attribute names;</li>
 * <li>{@code n} data lines with {@code p + 2} tab-separated fields each:
 *     row name, row description, then {@code p} values. An empty value is
 *     stored as {@code NaN}.</li>
 * </ol>
 * Exactly {@code n} data lines are read; anything after them is ignored.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data. It is closed when this method
 *        returns, whether normally or by throwing.
 * @return the parsed dataset.
 * @throws java.io.IOException if the stream cannot be read, ends early, or
 *         does not follow the layout above.
 * @throws ParseException declared in the signature; no statement in this
 *         method throws it explicitly.
 * @throws NumberFormatException if a size or a data value is not a valid number.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources: the reader (and the wrapped stream) is released on
    // every exit path, including the validation failures below.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }

        if (!line.equals("#1.2")) {
            throw new IOException("Invalid version.");
        }

        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }

        // Limit -1 keeps trailing empty fields so the column count stays exact.
        String[] tokens = line.split("\t", -1);
        if (tokens.length != 2) {
            throw new IOException("Invalid data size inforamation.");
        }

        int n = Integer.parseInt(tokens[0]);
        int p = Integer.parseInt(tokens[1]);
        if (n <= 0 || p <= 0) {
            throw new IOException(String.format("Invalid data size %d x %d.", n, p));
        }

        Attribute[] attributes = new Attribute[p];
        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }

        tokens = line.split("\t", -1);
        if (tokens.length != p + 2) {
            throw new IOException("Invalid title header.");
        }

        // The first two header fields label the name/description columns.
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + 2]);
        }

        AttributeDataset data = new AttributeDataset(name, attributes);

        for (int i = 0; i < n; i++) {
            line = reader.readLine();
            if (line == null) {
                throw new IOException("Premature end of file.");
            }

            tokens = line.split("\t", -1);
            if (tokens.length != p + 2) {
                // Data starts on the 4th line of the file, hence i + 4.
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i + 4, tokens.length));
            }

            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                if (tokens[j + 2].isEmpty()) {
                    x[j] = Double.NaN;
                } else {
                    x[j] = Double.valueOf(tokens[j + 2]);
                }
            }

            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            datum.description = tokens[1];
            data.add(datum);
        }

        return data;
    }
}
use of smile.data.NumericAttribute in project smile by haifengl.
the class PCLParser method parse.
/**
 * Parse a PCL dataset from an input stream.
 * <p>
 * The layout read by this method is:
 * <ol>
 * <li>a tab-separated header line; its fourth and later fields become the
 *     numeric attribute names;</li>
 * <li>a tab-separated line with the same number of fields as the header;
 *     its fourth and later fields are parsed as doubles and passed to the
 *     corresponding attribute's constructor;</li>
 * <li>any number of data lines with that same number of fields: row name,
 *     row description, row weight, then the values. An empty value is
 *     stored as {@code NaN}.</li>
 * </ol>
 *
 * @param name the name of dataset.
 * @param stream the input stream of data. It is closed when this method
 *        returns, whether normally or by throwing.
 * @return the parsed dataset.
 * @throws java.io.IOException if the stream cannot be read, ends early, or
 *         a line has an unexpected number of fields.
 * @throws ParseException declared in the signature; no statement in this
 *         method throws it explicitly.
 * @throws NumberFormatException if a weight or a data value is not a valid number.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources: the reader (and the wrapped stream) is released on
    // every exit path, including the validation failures below.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }

        // Limit -1 keeps trailing empty fields so the column count stays exact.
        String[] tokens = line.split("\t", -1);
        int p = tokens.length - 3;

        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }

        String[] weight = line.split("\t", -1);
        if (weight.length != tokens.length) {
            throw new IOException("Invalid sample weight header.");
        }

        // A header with fewer than three fields would otherwise surface as a
        // NegativeArraySizeException from the array allocation below.
        if (p < 0) {
            throw new IOException(String.format("Invalid header: expected at least 3 columns, found %d.", tokens.length));
        }

        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + 3], null, Double.valueOf(weight[i + 3]));
        }

        AttributeDataset data = new AttributeDataset(name, attributes);

        // i tracks the 1-based line number for error messages; data starts on line 3.
        for (int i = 3; (line = reader.readLine()) != null; i++) {
            tokens = line.split("\t", -1);
            if (tokens.length != weight.length) {
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i, tokens.length));
            }

            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                if (tokens[j + 3].isEmpty()) {
                    x[j] = Double.NaN;
                } else {
                    x[j] = Double.valueOf(tokens[j + 3]);
                }
            }

            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            datum.description = tokens[1];
            datum.weight = Double.valueOf(tokens[2]);
            data.add(datum);
        }

        return data;
    }
}
use of smile.data.NumericAttribute in project smile by haifengl.
the class TXTParser method parse.
/**
 * Parse a TXT dataset from an input stream.
 * <p>
 * The first line is a tab-separated header. Its first field labels the row
 * name column. If the second field equals {@code "description"} (ignoring
 * case), the second column of every row is stored as the row description
 * and the attributes start at the third column; otherwise the attributes
 * start at the second column. Every following line must have the same
 * number of fields as the header. An empty value is stored as {@code NaN}.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data. It is closed when this method
 *        returns, whether normally or by throwing.
 * @return the parsed dataset.
 * @throws java.io.IOException if the stream cannot be read, is empty, or a
 *         line has an unexpected number of fields.
 * @throws ParseException declared in the signature; no statement in this
 *         method throws it explicitly.
 * @throws NumberFormatException if a data value is not a valid number.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources: the reader (and the wrapped stream) is released on
    // every exit path, including the validation failures below.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }

        // Limit -1 keeps trailing empty fields so the column count stays exact.
        String[] tokens = line.split("\t", -1);

        // Guard the index: a header with a single field has no tokens[1].
        boolean hasDescription = tokens.length > 1 && tokens[1].equalsIgnoreCase("description");
        int start = hasDescription ? 2 : 1;
        int p = tokens.length - start;

        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + start]);
        }

        AttributeDataset data = new AttributeDataset(name, attributes);

        // i tracks the 1-based line number for error messages; data starts on line 2.
        for (int i = 2; (line = reader.readLine()) != null; i++) {
            tokens = line.split("\t", -1);
            if (tokens.length != p + start) {
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i, tokens.length));
            }

            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                if (tokens[j + start].isEmpty()) {
                    x[j] = Double.NaN;
                } else {
                    x[j] = Double.valueOf(tokens[j + start]);
                }
            }

            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            if (start == 2) {
                datum.description = tokens[1];
            }
            data.add(datum);
        }

        return data;
    }
}
use of smile.data.NumericAttribute in project smile by haifengl.
the class DelimitedTextParser method parse.
/**
 * Parse a dataset from a buffered reader.
 * <p>
 * Leading empty lines and lines starting with the configured comment prefix
 * are skipped. The first remaining line fixes the expected column count and
 * is treated as the column-name header when {@code hasColumnNames} is set,
 * or as the first data row otherwise. Later empty or comment lines are
 * skipped as well. Lines are split with limit 0, so trailing empty fields
 * are dropped before the column count is checked.
 * <p>
 * The reader is not closed by this method.
 *
 * @param name the name of dataset.
 * @param attributes the list attributes of data in proper order. If
 *        {@code null}, numeric attributes named {@code V1}, {@code V2}, ...
 *        are created from the width of the first line.
 * @param reader the buffered reader for data.
 * @return the parsed dataset.
 * @throws java.io.IOException if the reader fails or holds no data line.
 * @throws ParseException if the response index is out of range, a line has
 *         an unexpected number of columns, or a value cannot be converted.
 */
private AttributeDataset parse(String name, Attribute[] attributes, BufferedReader reader) throws IOException, ParseException {
    // Skip blank and comment lines in front of the first meaningful line.
    String line = reader.readLine();
    while (line != null && (line.isEmpty() || line.startsWith(comment))) {
        line = reader.readLine();
    }

    if (line == null) {
        throw new IOException("Empty data source.");
    }

    String[] s = line.split(delimiter, 0);

    if (attributes == null) {
        // Infer the attribute count: total columns minus the optional
        // row-name column and the optional response column.
        int p = s.length;
        if (hasRowNames) {
            p--;
        }

        if (responseIndex >= s.length) {
            throw new ParseException("Invalid response variable index: " + responseIndex, responseIndex);
        }

        if (responseIndex >= 0) {
            p--;
        }

        attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute("V" + (i + 1));
        }
    }

    // Expected width of every line, including row-name and response columns.
    int ncols = attributes.length;
    int startColumn = 0;
    if (hasRowNames) {
        ncols++;
        startColumn = 1;
    }

    if (responseIndex >= 0) {
        ncols++;
    }

    if (ncols != s.length) {
        throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
    }

    AttributeDataset data = new AttributeDataset(name, attributes, response);

    if (hasColumnNames) {
        // The first line is a header: rename the attributes (and the response) from it.
        for (int i = startColumn, k = 0; i < s.length; i++) {
            if (i != responseIndex) {
                attributes[k++].setName(s[i]);
            } else {
                response.setName(s[i]);
            }
        }
    } else {
        // No header: the first line is already a data row.
        data.add(toDatum(s, attributes, startColumn));
    }

    while ((line = reader.readLine()) != null) {
        if (line.isEmpty() || line.startsWith(comment)) {
            continue;
        }

        s = line.split(delimiter, 0);
        if (s.length != ncols) {
            throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
        }

        data.add(toDatum(s, attributes, startColumn));
    }

    return data;
}

/**
 * Converts one split line into a datum: the response column (if any) becomes
 * y, the missing-value placeholder becomes NaN, and every other column is
 * converted by its attribute.
 */
private Datum<double[]> toDatum(String[] s, Attribute[] attributes, int startColumn) throws ParseException {
    String rowName = hasRowNames ? s[0] : null;
    double[] x = new double[attributes.length];
    double y = Double.NaN;

    // i walks the columns of the line, k the attribute slots (the response column is skipped).
    for (int i = startColumn, k = 0; i < s.length; i++) {
        if (i == responseIndex) {
            y = response.valueOf(s[i]);
        } else if (missing != null && missing.equalsIgnoreCase(s[i])) {
            x[k++] = Double.NaN;
        } else {
            x[k] = attributes[k].valueOf(s[i]);
            k++;
        }
    }

    Datum<double[]> datum = new Datum<>(x, y);
    datum.name = rowName;
    return datum;
}
use of smile.data.NumericAttribute in project smile by haifengl.
the class ArffParser method parseAttribute.
/**
 * Parses the attribute declaration.
 * <p>
 * The attribute name is read first, then the type. A word token selects a
 * numeric (real/integer/numeric), string or date attribute; a non-word
 * token is expected to open a {@code {...}} enumeration of nominal values.
 * For the relational and end-of-subrelation keywords no attribute object is
 * created and {@code null} is returned.
 *
 * @param tokenizer the tokenizer positioned right after the attribute keyword.
 * @return an attributes in this relation, or {@code null} for a relational
 *         or end-of-subrelation declaration.
 * @throws IOException if the information is not read
 * successfully
 * @throws ParseException if the declaration is malformed or the input ends
 *         prematurely.
 */
private Attribute parseAttribute(StreamTokenizer tokenizer) throws IOException, ParseException {
    Attribute attribute = null;

    // Get attribute name.
    getNextToken(tokenizer);
    String attributeName = tokenizer.sval;
    getNextToken(tokenizer);

    // Check if attribute is nominal.
    if (tokenizer.ttype == StreamTokenizer.TT_WORD) {

        // Attribute is real, integer, or string.
        if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_REAL) || tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_INTEGER) || tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_NUMERIC)) {
            attribute = new NumericAttribute(attributeName);
            readTillEOL(tokenizer);

        } else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_STRING)) {
            attribute = new StringAttribute(attributeName);
            readTillEOL(tokenizer);

        } else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_DATE)) {
            // The date format is optional; it stays null when the line ends right after the keyword.
            String format = null;
            if (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
                if ((tokenizer.ttype != StreamTokenizer.TT_WORD) && (tokenizer.ttype != '\'') && (tokenizer.ttype != '\"')) {
                    throw new ParseException("not a valid date format", tokenizer.lineno());
                }
                format = tokenizer.sval;
                readTillEOL(tokenizer);
            } else {
                tokenizer.pushBack();
            }
            attribute = new DateAttribute(attributeName, null, format);
            readTillEOL(tokenizer);

        } else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_RELATIONAL)) {
            // No attribute is built for a relational declaration; null is returned.
            readTillEOL(tokenizer);

        } else if (tokenizer.sval.equalsIgnoreCase(ARFF_END_SUBRELATION)) {
            getNextToken(tokenizer);

        } else {
            throw new ParseException("Invalid attribute type or invalid enumeration", tokenizer.lineno());
        }

    } else {

        // Attribute is nominal.
        List<String> attributeValues = new ArrayList<>();
        tokenizer.pushBack();

        // Get values for nominal attribute.
        if (tokenizer.nextToken() != '{') {
            throw new ParseException("{ expected at beginning of enumeration", tokenizer.lineno());
        }

        while (tokenizer.nextToken() != '}') {
            if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
                throw new ParseException("} expected at end of enumeration", tokenizer.lineno());
            }
            // At end of input sval is null, so report the truncated
            // enumeration instead of failing with a NullPointerException.
            if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
                throw new ParseException(PREMATURE_END_OF_FILE, tokenizer.lineno());
            }
            attributeValues.add(tokenizer.sval.trim());
        }

        String[] values = attributeValues.toArray(new String[0]);
        attribute = new NominalAttribute(attributeName, values);
    }

    getLastToken(tokenizer, false);
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
        throw new ParseException(PREMATURE_END_OF_FILE, tokenizer.lineno());
    }

    return attribute;
}
Aggregations