Use of smile.data.StringAttribute in project smile by haifengl.
Class ArffParser, method parse.
/**
 * Parses a dataset from the given stream.
 * <p>
 * The header is read first to collect the attribute declarations. The attribute
 * whose position equals {@code responseIndex} (if there is one) is split off and
 * passed to the dataset as the response; the remaining attributes become the
 * dataset's regular attributes. Every following row, sparse ({@code {...}}) or
 * dense, is then parsed and added to the dataset.
 * <p>
 * The stream is wrapped in a reader that is closed when this method returns,
 * which also closes the given stream.
 *
 * @param stream the input stream to read from
 * @return the dataset built from the stream
 * @throws IOException if reading fails or the header declares no attributes
 * @throws ParseException propagated from the header and instance parsing helpers
 */
public AttributeDataset parse(InputStream stream) throws IOException, ParseException {
    // NOTE(review): no charset is given, so the platform default charset is used.
    try (Reader r = new BufferedReader(new InputStreamReader(stream))) {
        StreamTokenizer tokenizer = new StreamTokenizer(r);
        initTokenizer(tokenizer);

        List<Attribute> attributes = new ArrayList<>();
        String relationName = readHeader(tokenizer, attributes);
        if (attributes.isEmpty()) {
            throw new IOException("no header information available");
        }

        // Snapshot of every declared attribute, in declaration order and still
        // including the response; the row parsers need the full list.
        Attribute[] attr = attributes.toArray(new Attribute[0]);

        // Split the response attribute (if the index is in range) off the list.
        Attribute response = null;
        if (responseIndex >= 0 && responseIndex < attributes.size()) {
            response = attributes.remove(responseIndex);
        }

        AttributeDataset data = new AttributeDataset(relationName, attributes.toArray(new Attribute[0]), response);

        while (true) {
            // Check if end of file reached.
            getFirstToken(tokenizer);
            if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
                break;
            }

            // Parse instance: a leading '{' marks a sparse row.
            if (tokenizer.ttype == '{') {
                data.add(getSparseInstance(tokenizer, attr));
            } else {
                data.add(getInstance(tokenizer, attr));
            }
        }

        // Mark nominal and string attributes as no longer open once all rows are
        // read. Iterate over the full snapshot rather than the reduced list so
        // that the response attribute is not skipped.
        for (Attribute attribute : attr) {
            if (attribute instanceof NominalAttribute) {
                ((NominalAttribute) attribute).setOpen(false);
            }
            if (attribute instanceof StringAttribute) {
                ((StringAttribute) attribute).setOpen(false);
            }
        }

        return data;
    }
}
Use of smile.data.StringAttribute in project smile by haifengl.
Class ArffParser, method parseAttribute.
/**
 * Parses a single attribute declaration.
 * <p>
 * The next token is taken as the attribute name and the token after it decides
 * the kind of attribute: a type keyword (real/integer/numeric, string, date,
 * relational, end-of-subrelation) or a {@code {...}} enumeration of nominal
 * values. After the declaration the tokenizer is advanced to the first token of
 * the next line.
 *
 * @param tokenizer the tokenizer to read the declaration from
 * @return the attribute declared, or {@code null} for a relational or
 *         end-of-subrelation declaration (no attribute object is created for those)
 * @throws IOException if the information is not read successfully
 * @throws ParseException if the type keyword is unknown, the date format or the
 *         enumeration is malformed, or the input ends right after the declaration
 */
private Attribute parseAttribute(StreamTokenizer tokenizer) throws IOException, ParseException {
    Attribute attribute = null;

    // Get attribute name.
    getNextToken(tokenizer);
    String attributeName = tokenizer.sval;
    getNextToken(tokenizer);

    // A word token is a type keyword; anything else must open a nominal enumeration.
    if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
        String type = tokenizer.sval;
        if (type.equalsIgnoreCase(ARFF_ATTRIBUTE_REAL)
                || type.equalsIgnoreCase(ARFF_ATTRIBUTE_INTEGER)
                || type.equalsIgnoreCase(ARFF_ATTRIBUTE_NUMERIC)) {
            attribute = new NumericAttribute(attributeName);
            readTillEOL(tokenizer);
        } else if (type.equalsIgnoreCase(ARFF_ATTRIBUTE_STRING)) {
            attribute = new StringAttribute(attributeName);
            readTillEOL(tokenizer);
        } else if (type.equalsIgnoreCase(ARFF_ATTRIBUTE_DATE)) {
            // The date format is optional; when absent it stays null.
            String format = null;
            if (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
                if ((tokenizer.ttype != StreamTokenizer.TT_WORD) && (tokenizer.ttype != '\'') && (tokenizer.ttype != '\"')) {
                    throw new ParseException("not a valid date format", tokenizer.lineno());
                }
                format = tokenizer.sval;
                readTillEOL(tokenizer);
            } else {
                tokenizer.pushBack();
            }
            attribute = new DateAttribute(attributeName, null, format);
            readTillEOL(tokenizer);
        } else if (type.equalsIgnoreCase(ARFF_ATTRIBUTE_RELATIONAL)) {
            // No attribute object is created here; the method returns null.
            readTillEOL(tokenizer);
        } else if (type.equalsIgnoreCase(ARFF_END_SUBRELATION)) {
            getNextToken(tokenizer);
        } else {
            throw new ParseException("Invalid attribute type or invalid enumeration", tokenizer.lineno());
        }
    } else {
        // Attribute is nominal: the current token must be the opening brace.
        if (tokenizer.ttype != '{') {
            throw new ParseException("{ expected at beginning of enumeration", tokenizer.lineno());
        }

        // Get values for nominal attribute.
        List<String> attributeValues = new ArrayList<>();
        while (tokenizer.nextToken() != '}') {
            if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
                throw new ParseException("} expected at end of enumeration", tokenizer.lineno());
            }
            // Without this check an unterminated enumeration at end of input
            // would fail with a NullPointerException on sval below.
            if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
                throw new ParseException(PREMATURE_END_OF_FILE, tokenizer.lineno());
            }
            // Tokens that carry no string value (e.g. a stray ordinary character)
            // cannot be enumeration values.
            if (tokenizer.sval == null) {
                throw new ParseException("Invalid attribute type or invalid enumeration", tokenizer.lineno());
            }
            attributeValues.add(tokenizer.sval.trim());
        }

        attribute = new NominalAttribute(attributeName, attributeValues.toArray(new String[0]));
    }

    // Finish the declaration line and move to the first token of the next one;
    // running out of input here means the file ends before any data.
    getLastToken(tokenizer, false);
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
        throw new ParseException(PREMATURE_END_OF_FILE, tokenizer.lineno());
    }

    return attribute;
}
Aggregations