Search in sources:

Example 1 with Datum

Use of smile.data.Datum in project smile by haifengl.

The class GCTParser, method parse.

/**
 * Parses a GCT (version tag {@code #1.2}) dataset from an input stream.
 * <p>
 * Layout expected by this method, all fields tab-separated:
 * <ol>
 * <li>line 1: the version tag {@code #1.2};</li>
 * <li>line 2: the number of data rows and the number of data columns;</li>
 * <li>line 3: the title header, two leading columns followed by one
 *     attribute name per data column;</li>
 * <li>following lines: one data row each, whose first two fields become the
 *     datum name and description.</li>
 * </ol>
 * Empty value cells are stored as {@code Double.NaN}.
 * <p>
 * The reader wrapping {@code stream} is closed before this method returns,
 * both on success and on failure.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data.
 * @return the parsed dataset.
 * @throws IOException if reading fails or the content does not match the
 *         layout described above.
 * @throws ParseException declared by the signature; this implementation
 *         does not throw it itself.
 * @throws NumberFormatException if a size field or a non-empty value cell
 *         is not a valid number.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources guarantees the reader is released even when one of
    // the validation checks below throws.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }
        if (!line.equals("#1.2")) {
            throw new IOException("Invalid version.");
        }

        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }
        // limit -1 keeps trailing empty fields so column counts stay exact.
        String[] tokens = line.split("\t", -1);
        if (tokens.length != 2) {
            throw new IOException("Invalid data size information.");
        }
        int n = Integer.parseInt(tokens[0]);
        int p = Integer.parseInt(tokens[1]);
        if (n <= 0 || p <= 0) {
            throw new IOException(String.format("Invalid data size %d x %d.", n, p));
        }

        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }
        tokens = line.split("\t", -1);
        if (tokens.length != p + 2) {
            throw new IOException("Invalid title header.");
        }
        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + 2]);
        }

        AttributeDataset data = new AttributeDataset(name, attributes);
        for (int i = 0; i < n; i++) {
            line = reader.readLine();
            if (line == null) {
                throw new IOException("Premature end of file.");
            }
            tokens = line.split("\t", -1);
            if (tokens.length != p + 2) {
                // Data rows start on the 4th line of the file, hence i + 4.
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i + 4, tokens.length));
            }
            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                String cell = tokens[j + 2];
                x[j] = cell.isEmpty() ? Double.NaN : Double.parseDouble(cell);
            }
            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            datum.description = tokens[1];
            data.add(datum);
        }
        return data;
    }
}
Also used : AttributeDataset(smile.data.AttributeDataset) Datum(smile.data.Datum) InputStreamReader(java.io.InputStreamReader) Attribute(smile.data.Attribute) NumericAttribute(smile.data.NumericAttribute) IOException(java.io.IOException) NumericAttribute(smile.data.NumericAttribute) BufferedReader(java.io.BufferedReader)

Example 2 with Datum

Use of smile.data.Datum in project smile by haifengl.

The class PCLParser, method parse.

/**
 * Parses a PCL dataset from an input stream.
 * <p>
 * Layout expected by this method, all fields tab-separated:
 * <ol>
 * <li>line 1: the header, three leading columns followed by one attribute
 *     name per data column;</li>
 * <li>line 2: the sample weight row, with the same number of columns as the
 *     header; columns from the fourth onward are parsed as attribute
 *     weights;</li>
 * <li>following lines: one data row each, whose first three fields become
 *     the datum name, description and weight.</li>
 * </ol>
 * Empty value cells are stored as {@code Double.NaN}.
 * <p>
 * The reader wrapping {@code stream} is closed before this method returns,
 * both on success and on failure.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data.
 * @return the parsed dataset.
 * @throws IOException if reading fails, the header has fewer than three
 *         columns, or a row has a different column count than the header.
 * @throws ParseException declared by the signature; this implementation
 *         does not throw it itself.
 * @throws NumberFormatException if a weight or a non-empty value cell is not
 *         a valid number.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources guarantees the reader is released even when one of
    // the validation checks below throws.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }
        // limit -1 keeps trailing empty fields so column counts stay exact.
        String[] tokens = line.split("\t", -1);
        if (tokens.length < 3) {
            // Without this check p would be negative and the array allocation
            // below would fail with NegativeArraySizeException.
            throw new IOException(String.format("Invalid header: expected at least 3 columns, found %d.", tokens.length));
        }
        int p = tokens.length - 3;

        line = reader.readLine();
        if (line == null) {
            throw new IOException("Premature end of file.");
        }
        String[] weight = line.split("\t", -1);
        if (weight.length != tokens.length) {
            throw new IOException("Invalid sample weight header.");
        }

        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + 3], null, Double.valueOf(weight[i + 3]));
        }

        AttributeDataset data = new AttributeDataset(name, attributes);
        // i tracks the 1-based file line number; data rows start on line 3.
        for (int i = 3; (line = reader.readLine()) != null; i++) {
            tokens = line.split("\t", -1);
            if (tokens.length != weight.length) {
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i, tokens.length));
            }
            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                String cell = tokens[j + 3];
                x[j] = cell.isEmpty() ? Double.NaN : Double.parseDouble(cell);
            }
            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            datum.description = tokens[1];
            datum.weight = Double.parseDouble(tokens[2]);
            data.add(datum);
        }
        return data;
    }
}
Also used : AttributeDataset(smile.data.AttributeDataset) Datum(smile.data.Datum) InputStreamReader(java.io.InputStreamReader) Attribute(smile.data.Attribute) NumericAttribute(smile.data.NumericAttribute) IOException(java.io.IOException) NumericAttribute(smile.data.NumericAttribute) BufferedReader(java.io.BufferedReader)

Example 3 with Datum

Use of smile.data.Datum in project smile by haifengl.

The class TXTParser, method parse.

/**
 * Parses a TXT dataset from an input stream.
 * <p>
 * Layout expected by this method, all fields tab-separated:
 * <ol>
 * <li>line 1: the header. The first column is the row-name column. If the
 *     second column is named {@code description} (case-insensitive) it is
 *     treated as a description column; every remaining column is an
 *     attribute name;</li>
 * <li>following lines: one data row each, with the same number of columns
 *     as the header.</li>
 * </ol>
 * Empty value cells are stored as {@code Double.NaN}.
 * <p>
 * The reader wrapping {@code stream} is closed before this method returns,
 * both on success and on failure.
 *
 * @param name the name of dataset.
 * @param stream the input stream of data.
 * @return the parsed dataset.
 * @throws IOException if reading fails, the header has fewer than two
 *         columns, or a row has a different column count than the header.
 * @throws ParseException declared by the signature; this implementation
 *         does not throw it itself.
 * @throws NumberFormatException if a non-empty value cell is not a valid
 *         number.
 */
public AttributeDataset parse(String name, InputStream stream) throws IOException, ParseException {
    // try-with-resources guarantees the reader is released even when one of
    // the validation checks below throws.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
        String line = reader.readLine();
        if (line == null) {
            throw new IOException("Empty data source.");
        }
        // limit -1 keeps trailing empty fields so column counts stay exact.
        String[] tokens = line.split("\t", -1);
        if (tokens.length < 2) {
            // tokens[1] is inspected below; a single-column (or blank) header
            // would otherwise fail with ArrayIndexOutOfBoundsException.
            throw new IOException(String.format("Invalid header: expected at least 2 columns, found %d.", tokens.length));
        }

        // start is the index of the first value column.
        int start = 1;
        int p = tokens.length - 1;
        if (tokens[1].equalsIgnoreCase("description")) {
            start = 2;
            p = tokens.length - 2;
        }

        Attribute[] attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute(tokens[i + start]);
        }

        AttributeDataset data = new AttributeDataset(name, attributes);
        // i tracks the 1-based file line number; data rows start on line 2.
        for (int i = 2; (line = reader.readLine()) != null; i++) {
            tokens = line.split("\t", -1);
            if (tokens.length != p + start) {
                throw new IOException(String.format("Invalid number of elements of line %d: %d", i, tokens.length));
            }
            double[] x = new double[p];
            for (int j = 0; j < p; j++) {
                String cell = tokens[j + start];
                x[j] = cell.isEmpty() ? Double.NaN : Double.parseDouble(cell);
            }
            Datum<double[]> datum = new Datum<>(x);
            datum.name = tokens[0];
            if (start == 2) {
                datum.description = tokens[1];
            }
            data.add(datum);
        }
        return data;
    }
}
Also used : AttributeDataset(smile.data.AttributeDataset) Datum(smile.data.Datum) InputStreamReader(java.io.InputStreamReader) Attribute(smile.data.Attribute) NumericAttribute(smile.data.NumericAttribute) IOException(java.io.IOException) NumericAttribute(smile.data.NumericAttribute) BufferedReader(java.io.BufferedReader)

Example 4 with Datum

Use of smile.data.Datum in project smile by haifengl.

The class DelimitedTextParser, method parse.

/**
 * Parses a delimited-text dataset from a buffered reader.
 * <p>
 * Leading empty lines and lines starting with {@code comment} are skipped.
 * The first remaining line fixes the expected column count. When
 * {@code attributes} is {@code null}, numeric attributes named
 * {@code V1..Vp} are created, where {@code p} is that line's column count
 * minus the row-name column (if {@code hasRowNames}) and the response
 * column (if {@code responseIndex >= 0}). If {@code hasColumnNames} is set,
 * the first line supplies attribute (and response) names; otherwise it is
 * parsed as the first data row. Subsequent empty and comment lines are
 * ignored.
 * <p>
 * This method does not close {@code reader}.
 *
 * @param name the name of dataset.
 * @param attributes the list attributes of data in proper order, or
 *        {@code null} to generate numeric attributes from the first line.
 * @param reader the buffered reader for data.
 * @return the parsed dataset.
 * @throws IOException if reading fails or no data line is found.
 * @throws ParseException if the response index is out of range or a line
 *         does not have the expected number of columns.
 */
private AttributeDataset parse(String name, Attribute[] attributes, BufferedReader reader) throws IOException, ParseException {
    // Skip leading blank and comment lines.
    String line = reader.readLine();
    while (line != null && (line.isEmpty() || line.startsWith(comment))) {
        line = reader.readLine();
    }
    if (line == null) {
        throw new IOException("Empty data source.");
    }

    String[] s = line.split(delimiter, 0);
    if (attributes == null) {
        int p = s.length;
        if (hasRowNames) {
            p--;
        }
        if (responseIndex >= s.length) {
            throw new ParseException("Invalid response variable index: " + responseIndex, responseIndex);
        }
        if (responseIndex >= 0) {
            p--;
        }
        attributes = new Attribute[p];
        for (int i = 0; i < p; i++) {
            attributes[i] = new NumericAttribute("V" + (i + 1));
        }
    }

    // Expected number of columns per line: attributes plus the optional
    // row-name and response columns.
    int ncols = attributes.length;
    int startColumn = 0;
    if (hasRowNames) {
        ncols++;
        startColumn = 1;
    }
    if (responseIndex >= 0) {
        ncols++;
    }
    if (ncols != s.length) {
        throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
    }

    AttributeDataset data = new AttributeDataset(name, attributes, response);
    if (hasColumnNames) {
        // The first line is a header: copy its names onto the attributes.
        for (int i = startColumn, k = 0; i < s.length; i++) {
            if (i != responseIndex) {
                attributes[k++].setName(s[i]);
            } else {
                response.setName(s[i]);
            }
        }
    } else {
        // No header: the first line is already a data row.
        data.add(parseRow(s, attributes, startColumn));
    }

    while ((line = reader.readLine()) != null) {
        if (line.isEmpty() || line.startsWith(comment)) {
            continue;
        }
        s = line.split(delimiter, 0);
        if (s.length != ncols) {
            throw new ParseException(String.format("%d columns, expected %d", s.length, ncols), s.length);
        }
        data.add(parseRow(s, attributes, startColumn));
    }
    return data;
}

/**
 * Converts one already-split line into a datum: the column at
 * {@code responseIndex} becomes the response value, cells equal to the
 * {@code missing} marker (case-insensitive) become {@code Double.NaN}, and
 * every other cell is converted by its attribute.
 *
 * @param s the fields of the line.
 * @param attributes the attributes, in column order.
 * @param startColumn index of the first non-row-name column.
 * @return the datum for this line, named after {@code s[0]} when
 *         {@code hasRowNames} is set.
 * @throws ParseException if converting a cell value raises it.
 */
private Datum<double[]> parseRow(String[] s, Attribute[] attributes, int startColumn) throws ParseException {
    String rowName = hasRowNames ? s[0] : null;
    double[] x = new double[attributes.length];
    double y = Double.NaN;
    for (int i = startColumn, k = 0; i < s.length; i++) {
        if (i == responseIndex) {
            y = response.valueOf(s[i]);
        } else if (missing != null && missing.equalsIgnoreCase(s[i])) {
            x[k++] = Double.NaN;
        } else {
            x[k] = attributes[k].valueOf(s[i]);
            k++;
        }
    }
    Datum<double[]> datum = new Datum<>(x, y);
    datum.name = rowName;
    return datum;
}
Also used : AttributeDataset(smile.data.AttributeDataset) Datum(smile.data.Datum) IOException(java.io.IOException) ParseException(java.text.ParseException) NumericAttribute(smile.data.NumericAttribute)

Example 5 with Datum

Use of smile.data.Datum in project smile by haifengl.

The class FeatureSet, method f.

/**
 * Builds a new attribute dataset by applying the feature generators to
 * every datum of the input dataset.
 * <p>
 * The result takes its name, description and response from {@code data};
 * each generated datum keeps the source datum's response value, weight,
 * name, description and timestamp.
 *
 * @param data input dataset.
 * @return an attribute dataset with generated features
 */
public AttributeDataset f(Dataset<T> data) {
    AttributeDataset result = new AttributeDataset(data.getName(), attributes(), data.response());
    result.setDescription(data.getDescription());

    int index = 0;
    while (index < data.size()) {
        Datum<T> source = data.get(index);
        double[] features = f(source.x);

        Datum<double[]> generated = new Datum<>(features, source.y, source.weight);
        generated.name = source.name;
        generated.description = source.description;
        generated.timestamp = source.timestamp;
        result.add(generated);

        index++;
    }
    return result;
}
Also used : AttributeDataset(smile.data.AttributeDataset) Datum(smile.data.Datum)

Aggregations

AttributeDataset (smile.data.AttributeDataset)6 Datum (smile.data.Datum)6 IOException (java.io.IOException)5 NumericAttribute (smile.data.NumericAttribute)5 BufferedReader (java.io.BufferedReader)4 InputStreamReader (java.io.InputStreamReader)4 Attribute (smile.data.Attribute)4 ParseException (java.text.ParseException)1