Search in sources :

Example 1 with NumericAttribute

use of smile.data.NumericAttribute in project smile by haifengl.

the class GCTParser method parse.

/**
 * Parse a GCT dataset from an input stream.
 * <p>
 * The expected tab-separated layout, as enforced by this method, is:
 * <ol>
 * <li>line 1: the literal version tag {@code #1.2};</li>
 * <li>line 2: two integers, the number of data rows and the number of data columns;</li>
 * <li>line 3: the title header, whose first two fields are skipped and whose
 *     remaining fields name the numeric attributes;</li>
 * <li>lines 4 onward: one row each, whose first two fields become the datum name
 *     and description, followed by the values. An empty cell is stored as NaN.</li>
 * </ol>
 * The reader wrapping {@code stream} is closed when this method returns or throws.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data.
 * @return the parsed dataset.
 * @throws java.io.IOException if the stream cannot be read or the content does
 *         not follow the layout described above.
 * @throws ParseException declared in the signature; no statement in this method
 *         throws it directly.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources: the reader must be released on format errors too,
    // not only on the success path.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }
        if (!line.equals("#1.2")) {
            throw new IOException("Invalid version.");
        }

        // Second line: "<rows>\t<columns>".
        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }
        String[] tokens = line.split("\t", -1);
        if (tokens.length != 2) {
            throw new IOException("Invalid data size information.");
        }
        int n = Integer.parseInt(tokens[0]);
        int p = Integer.parseInt(tokens[1]);
        if (n <= 0 || p <= 0) {
            throw new IOException(String.format("Invalid data size %d x %d.", n, p));
        }

        // Third line: title header; the first two fields are not attributes.
        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }
        tokens = line.split("\t", -1);
        if (tokens.length != p + 2) {
            throw new IOException("Invalid title header.");
        }
        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + 2]);
        }

        AttributeDataset data = new AttributeDataset(name, attributes);
        for (int i = 0; i < n; i++) {
            line = reader.readLine();
            if (line == null) {
                throw new IOException("Premature end of file.");
            }
            tokens = line.split("\t", -1);
            if (tokens.length != p + 2) {
                // Data rows start on the 4th line of the file, hence i + 4.
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i + 4, tokens.length));
            }
            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                String cell = tokens[j + 2];
                // An empty cell denotes a missing value.
                x[j] = cell.isEmpty() ? Double.NaN : Double.parseDouble(cell);
            }
            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            datum.description = tokens[1];
            data.add(datum);
        }
        return data;
    }
}
Also used : AttributeDataset(smile.data.AttributeDataset) Datum(smile.data.Datum) InputStreamReader(java.io.InputStreamReader) Attribute(smile.data.Attribute) NumericAttribute(smile.data.NumericAttribute) IOException(java.io.IOException) NumericAttribute(smile.data.NumericAttribute) BufferedReader(java.io.BufferedReader)

Example 2 with NumericAttribute

use of smile.data.NumericAttribute in project smile by haifengl.

the class PCLParser method parse.

/**
 * Parse a PCL dataset from an input stream.
 * <p>
 * The input is tab-separated. The first line is the header and the second line
 * carries one weight per attribute; both must have the same number of fields.
 * In every line the first three fields are not attribute values: on data rows
 * they are read as the datum name, description and weight. Fields from the
 * fourth onward are numeric values, with an empty cell stored as NaN.
 * The reader wrapping {@code stream} is closed when this method returns or throws.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data.
 * @return the parsed dataset.
 * @throws java.io.IOException if the stream cannot be read, is empty, ends
 *         before the weight line, or contains a line with an unexpected
 *         number of fields.
 * @throws ParseException declared in the signature; no statement in this method
 *         throws it directly.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources: release the reader on error paths as well.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }
        String[] tokens = line.split("\t", -1);
        // Three leading fields are reserved; fewer would yield a negative
        // attribute count and a NegativeArraySizeException below.
        if (tokens.length < 3) {
            throw new IOException(String.format("Invalid header: expected at least 3 columns but found %d.", tokens.length));
        }
        int p = tokens.length - 3;

        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }
        String[] weight = line.split("\t", -1);
        if (weight.length != tokens.length) {
            throw new IOException("Invalid sample weight header.");
        }

        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + 3], null, Double.valueOf(weight[i + 3]));
        }

        AttributeDataset data = new AttributeDataset(name, attributes);
        // i tracks the 1-based file line number for error messages; data
        // rows begin on line 3, after the header and weight lines.
        for (int i = 3; (line = reader.readLine()) != null; i++) {
            tokens = line.split("\t", -1);
            if (tokens.length != weight.length) {
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i, tokens.length));
            }
            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                String cell = tokens[j + 3];
                // An empty cell denotes a missing value.
                x[j] = cell.isEmpty() ? Double.NaN : Double.parseDouble(cell);
            }
            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            datum.description = tokens[1];
            datum.weight = Double.valueOf(tokens[2]);
            data.add(datum);
        }
        return data;
    }
}
Also used : AttributeDataset(smile.data.AttributeDataset) Datum(smile.data.Datum) InputStreamReader(java.io.InputStreamReader) Attribute(smile.data.Attribute) NumericAttribute(smile.data.NumericAttribute) IOException(java.io.IOException) NumericAttribute(smile.data.NumericAttribute) BufferedReader(java.io.BufferedReader)

Example 3 with NumericAttribute

use of smile.data.NumericAttribute in project smile by haifengl.

the class TXTParser method parse.

/**
 * Parse a TXT dataset from an input stream.
 * <p>
 * The input is tab-separated with a header on the first line. The first field
 * of every line is the row name. If the second header field equals
 * "description" (case-insensitive), the second field of every row is read as
 * the datum description; otherwise it is treated as the first numeric column.
 * Empty cells are stored as NaN.
 * The reader wrapping {@code stream} is closed when this method returns or throws.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data.
 * @return the parsed dataset.
 * @throws java.io.IOException if the stream cannot be read, is empty, or a
 *         row does not have the same number of fields as the header.
 * @throws ParseException declared in the signature; no statement in this method
 *         throws it directly.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources: release the reader on error paths as well.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }
        String[] tokens = line.split("\t", -1);

        // Index of the first numeric column: 1 by default, 2 when a
        // description column is present. The length check prevents an
        // ArrayIndexOutOfBoundsException on a single-field header.
        int start = 1;
        if (tokens.length > 1 && tokens[1].equalsIgnoreCase("description")) {
            start = 2;
        }
        int p = tokens.length - start;

        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + start]);
        }

        AttributeDataset data = new AttributeDataset(name, attributes);
        // i tracks the 1-based file line number for error messages; data
        // rows begin on line 2, right after the header.
        for (int i = 2; (line = reader.readLine()) != null; i++) {
            tokens = line.split("\t", -1);
            if (tokens.length != p + start) {
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i, tokens.length));
            }
            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                String cell = tokens[j + start];
                // An empty cell denotes a missing value.
                x[j] = cell.isEmpty() ? Double.NaN : Double.parseDouble(cell);
            }
            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            if (start == 2) {
                datum.description = tokens[1];
            }
            data.add(datum);
        }
        return data;
    }
}
Also used : AttributeDataset(smile.data.AttributeDataset) Datum(smile.data.Datum) InputStreamReader(java.io.InputStreamReader) Attribute(smile.data.Attribute) NumericAttribute(smile.data.NumericAttribute) IOException(java.io.IOException) NumericAttribute(smile.data.NumericAttribute) BufferedReader(java.io.BufferedReader)

Example 4 with NumericAttribute

use of smile.data.NumericAttribute in project smile by haifengl.

the class DelimitedTextParser method parse.

/**
 * Parse a dataset from a buffered reader.
 * <p>
 * Blank lines and lines starting with the configured comment prefix are
 * skipped. The first remaining line fixes the expected column count; it is
 * consumed as the column-name header when {@code hasColumnNames} is set and
 * parsed as a data row otherwise. Cells equal (ignoring case) to the
 * configured missing-value marker are stored as NaN. The reader is not closed
 * by this method.
 *
 * @param name the name of dataset.
 * @param attributes the list attributes of data in proper order. If
 *        {@code null}, one numeric attribute named {@code V1..Vp} is created
 *        per non-row-name, non-response column of the first line.
 * @param reader the buffered reader for data.
 * @return the parsed dataset.
 * @throws java.io.IOException if reading fails or no data line is found.
 * @throws ParseException if the response index lies outside the first line's
 *         columns, or a line has an unexpected number of columns.
 */
private AttributeDataset parse(String name, Attribute[] attributes, BufferedReader reader) throws IOException, ParseException {
    // Advance to the first line that is neither blank nor a comment.
    String line = reader.readLine();
    while (line != null && (line.isEmpty() || line.startsWith(comment))) {
        line = reader.readLine();
    }
    if (line == null) {
        throw new IOException("Empty data source.");
    }

    String[] s = line.split(delimiter, 0);

    // Validate the response index whether or not attributes were supplied:
    // with caller-provided attributes an out-of-range index could still pass
    // the column-count check and overrun the attribute array further down.
    if (responseIndex >= s.length) {
        throw new ParseException("Invalid response variable index: " + responseIndex, responseIndex);
    }

    if (attributes == null) {
        // Infer the attribute count from the first line.
        int p = s.length;
        if (hasRowNames) {
            p--;
        }
        if (responseIndex >= 0) {
            p--;
        }
        attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute("V" + (i + 1));
        }
    }

    // Expected columns per line: attributes plus optional row name and response.
    int ncols = attributes.length;
    int startColumn = 0;
    if (hasRowNames) {
        ncols++;
        startColumn = 1;
    }
    if (responseIndex >= 0) {
        ncols++;
    }
    if (ncols != s.length) {
        throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
    }

    AttributeDataset data = new AttributeDataset(name, attributes, response);

    if (hasColumnNames) {
        // The first line is a header: copy its names, then move on.
        for (int i = startColumn, k = 0; i < s.length; i++) {
            if (i != responseIndex) {
                attributes[k++].setName(s[i]);
            } else {
                response.setName(s[i]);
            }
        }
        line = reader.readLine();
    }

    // Single row loop. Without a header, `line` still holds the first data
    // line here, so it is parsed by the same code as every other row.
    for (; line != null; line = reader.readLine()) {
        if (line.isEmpty() || line.startsWith(comment)) {
            continue;
        }
        s = line.split(delimiter, 0);
        if (s.length != ncols) {
            throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
        }
        String rowName = hasRowNames ? s[0] : null;
        double[] x = new double[attributes.length];
        double y = Double.NaN;
        for (int i = startColumn, k = 0; i < s.length; i++) {
            if (i == responseIndex) {
                y = response.valueOf(s[i]);
            } else if (missing != null && missing.equalsIgnoreCase(s[i])) {
                x[k++] = Double.NaN;
            } else {
                x[k] = attributes[k].valueOf(s[i]);
                k++;
            }
        }
        Datum<double[]> datum = new Datum<>(x, y);
        datum.name = rowName;
        data.add(datum);
    }
    return data;
}
Also used : AttributeDataset(smile.data.AttributeDataset) Datum(smile.data.Datum) IOException(java.io.IOException) ParseException(java.text.ParseException) NumericAttribute(smile.data.NumericAttribute)

Example 5 with NumericAttribute

use of smile.data.NumericAttribute in project smile by haifengl.

the class ArffParser method parseAttribute.

/**
 * Reads one attribute declaration from the tokenizer.
 * <p>
 * The first token is taken as the attribute name. The next token is either a
 * type keyword (numeric/real/integer, string, date with an optional format,
 * relational, end of sub-relation) or the start of a {@code { ... }}
 * enumeration of nominal values.
 *
 * @param tokenizer the tokenizer positioned at the attribute name.
 * @return the declared attribute, or {@code null} when the declaration is a
 *         relational attribute or an end-of-subrelation marker.
 * @throws IOException if reading from the tokenizer fails.
 * @throws ParseException if the type keyword is unknown, the date format
 *         token is invalid, the enumeration braces are malformed, or the
 *         input ends right after the declaration.
 */
private Attribute parseAttribute(StreamTokenizer tokenizer) throws IOException, ParseException {
    // Name first, then the token that decides the kind of attribute.
    getNextToken(tokenizer);
    final String name = tokenizer.sval;
    getNextToken(tokenizer);

    Attribute result = null;
    if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
        // Not a keyword: must be a brace-delimited list of nominal values.
        List<String> levels = new ArrayList<>();
        tokenizer.pushBack();
        if (tokenizer.nextToken() != '{') {
            throw new ParseException("{ expected at beginning of enumeration", tokenizer.lineno());
        }
        for (int token = tokenizer.nextToken(); token != '}'; token = tokenizer.nextToken()) {
            if (token == StreamTokenizer.TT_EOL) {
                throw new ParseException("} expected at end of enumeration", tokenizer.lineno());
            }
            levels.add(tokenizer.sval.trim());
        }
        result = new NominalAttribute(name, levels.toArray(new String[levels.size()]));
    } else {
        // Keyword-typed attribute.
        final String keyword = tokenizer.sval;
        final boolean isNumber = keyword.equalsIgnoreCase(ARFF_ATTRIBUTE_REAL)
                || keyword.equalsIgnoreCase(ARFF_ATTRIBUTE_INTEGER)
                || keyword.equalsIgnoreCase(ARFF_ATTRIBUTE_NUMERIC);

        if (isNumber) {
            result = new NumericAttribute(name);
            readTillEOL(tokenizer);
        } else if (keyword.equalsIgnoreCase(ARFF_ATTRIBUTE_STRING)) {
            result = new StringAttribute(name);
            readTillEOL(tokenizer);
        } else if (keyword.equalsIgnoreCase(ARFF_ATTRIBUTE_DATE)) {
            // An optional format (bare word or quoted string) may follow.
            String pattern = null;
            final int next = tokenizer.nextToken();
            if (next == StreamTokenizer.TT_EOL) {
                tokenizer.pushBack();
            } else {
                final boolean wordOrQuoted = next == StreamTokenizer.TT_WORD || next == '\'' || next == '\"';
                if (!wordOrQuoted) {
                    throw new ParseException("not a valid date format", tokenizer.lineno());
                }
                pattern = tokenizer.sval;
                readTillEOL(tokenizer);
            }
            result = new DateAttribute(name, null, pattern);
            readTillEOL(tokenizer);
        } else if (keyword.equalsIgnoreCase(ARFF_ATTRIBUTE_RELATIONAL)) {
            // No attribute object is built for relational declarations.
            readTillEOL(tokenizer);
        } else if (keyword.equalsIgnoreCase(ARFF_END_SUBRELATION)) {
            getNextToken(tokenizer);
        } else {
            throw new ParseException("Invalid attribute type or invalid enumeration", tokenizer.lineno());
        }
    }

    // Finish this declaration and peek at what follows; a file that stops
    // here is reported as truncated.
    getLastToken(tokenizer, false);
    getFirstToken(tokenizer);
    if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
        throw new ParseException(PREMATURE_END_OF_FILE, tokenizer.lineno());
    }
    return result;
}
Also used : NominalAttribute(smile.data.NominalAttribute) Attribute(smile.data.Attribute) NominalAttribute(smile.data.NominalAttribute) NumericAttribute(smile.data.NumericAttribute) DateAttribute(smile.data.DateAttribute) StringAttribute(smile.data.StringAttribute) StringAttribute(smile.data.StringAttribute) ArrayList(java.util.ArrayList) ParseException(java.text.ParseException) NumericAttribute(smile.data.NumericAttribute) DateAttribute(smile.data.DateAttribute)

Aggregations

NumericAttribute (smile.data.NumericAttribute): 6, IOException (java.io.IOException): 5, Attribute (smile.data.Attribute): 5, AttributeDataset (smile.data.AttributeDataset): 5, Datum (smile.data.Datum): 5, BufferedReader (java.io.BufferedReader): 4, InputStreamReader (java.io.InputStreamReader): 4, ParseException (java.text.ParseException): 2, ArrayList (java.util.ArrayList): 1, DateAttribute (smile.data.DateAttribute): 1, NominalAttribute (smile.data.NominalAttribute): 1, StringAttribute (smile.data.StringAttribute): 1