Search in sources :

Example 1 with CSVTokenizer

Use of org.pentaho.reporting.libraries.base.util.CSVTokenizer in the project data-access by pentaho.

In the class CsvUtils, the method getDataProfile.

/**
 * Samples up to {@code rowLimit} lines of the CSV file at {@code fileLocation} and
 * builds a {@link DataProfile} from them.
 *
 * <p>The first {@code fileInfo.getHeaderRows()} lines read are kept as header rows and
 * the remaining lines as data rows. Every line is tokenized with the delimiter and
 * enclosure from {@code fileInfo}, and each token is trimmed. The last header row, when
 * present, supplies column titles; otherwise a generated default title is used.
 *
 * @param fileInfo     supplies the delimiter, the enclosure and the number of header rows
 * @param rowLimit     maximum number of lines (header rows included) to read
 * @param fileLocation path of the file to open
 * @param fileType     file format constant passed through to {@code TextFileInput.getLine}
 * @param encoding     charset name used to decode the file
 * @return the profile holding the sampled data rows and one {@link ColumnInfo} per column
 * @throws CsvParseException if tokenizing a line raises an {@link IllegalArgumentException}
 * @throws Exception         if the file cannot be opened, read or closed
 */
private DataProfile getDataProfile(CsvFileInfo fileInfo, int rowLimit, String fileLocation, int fileType, String encoding) throws Exception {
    DataProfile result = new DataProfile();
    String line = null;
    int row = 0;
    List<List<String>> headerSample = new ArrayList<List<String>>();
    List<List<String>> dataSample = new ArrayList<List<String>>(rowLimit);
    int maxColumns = 0;
    InputStream inputStream = null;
    InputStreamReader reader = null;
    // Tracks whether the read phase finished normally, so that a failure while closing
    // never replaces an exception that is already propagating.
    boolean readSucceeded = false;
    try {
        inputStream = new FileInputStream(fileLocation);
        UnicodeBOMInputStream bomIs = new UnicodeBOMInputStream(inputStream);
        reader = new InputStreamReader(bomIs, encoding);
        bomIs.skipBOM();
        // read each line of text file
        StringBuilder stringBuilder = new StringBuilder(1000);
        line = TextFileInput.getLine(null, reader, fileType, stringBuilder);
        while (line != null && row < rowLimit) {
            CSVTokenizer csvt = new CSVTokenizer(line, fileInfo.getDelimiter(), fileInfo.getEnclosure());
            List<String> rowData = new ArrayList<String>();
            int count = 0;
            while (csvt.hasMoreTokens()) {
                String token = csvt.nextToken();
                if (token != null) {
                    token = token.trim();
                }
                rowData.add(token);
                count++;
            }
            // the widest row seen determines the number of columns in the profile
            if (maxColumns < count) {
                maxColumns = count;
            }
            if (row < fileInfo.getHeaderRows()) {
                headerSample.add(rowData);
            } else {
                dataSample.add(rowData);
            }
            line = TextFileInput.getLine(null, reader, fileType, stringBuilder);
            row++;
        }
        readSucceeded = true;
    } catch (IllegalArgumentException iae) {
        Logger.error(getClass().getSimpleName(), "There was an issue parsing the CSV file", iae); // $NON-NLS-1$
        // report the 1-based number of the line being processed together with its text
        throw new CsvParseException(row + 1, line);
    } catch (Exception e) {
        Logger.error(getClass().getSimpleName(), "Could not read CSV", e); // $NON-NLS-1$
        throw e;
    } finally {
        // close the file
        try {
            if (reader != null) {
                reader.close();
            } else if (inputStream != null) {
                // the reader was never created (e.g. unsupported encoding), so the raw
                // stream has to be released directly
                inputStream.close();
            }
        } catch (Exception e) {
            if (readSucceeded) {
                throw e;
            }
            // a read/parse failure is already propagating; log the close failure
            // instead of letting it mask the original exception
            Logger.error(getClass().getSimpleName(), "Could not close CSV", e); // $NON-NLS-1$
        }
    }
    // copy the header rows into a rectangular array; short rows leave trailing nulls
    String[][] headerValues = new String[headerSample.size()][maxColumns];
    int rowNo = 0;
    for (List<String> values : headerSample) {
        int colNo = 0;
        for (String value : values) {
            headerValues[rowNo][colNo] = value;
            colNo++;
        }
        rowNo++;
    }
    int[] fieldLengths = new int[maxColumns];
    String[][] dataValues = new String[dataSample.size()][maxColumns];
    DataRow[] data = new DataRow[dataSample.size()];
    rowNo = 0;
    for (List<String> values : dataSample) {
        int colNo = 0;
        for (String value : values) {
            dataValues[rowNo][colNo] = value;
            int currentMaxLength = fieldLengths[colNo];
            // tokens are treated as possibly null when they are read, so guard here too
            if (value != null && value.length() > currentMaxLength) {
                fieldLengths[colNo] = value.length();
            }
            colNo++;
        }
        data[rowNo] = new DataRow();
        data[rowNo].setCells(dataValues[rowNo]);
        rowNo++;
    }
    result.setRows(data);
    DecimalFormat df = new DecimalFormat("000"); // $NON-NLS-1$
    ColumnInfo[] profiles = new ColumnInfo[maxColumns];
    for (int idx = 0; idx < maxColumns; idx++) {
        ColumnInfo profile = new ColumnInfo();
        profiles[idx] = profile;
        // defaults used when no header value exists for this column
        String title = CsvFileInfo.DEFAULT_COLUMN_NAME_PREFIX + df.format(idx + 1);
        String colId = "PC_" + idx; // $NON-NLS-1$
        if (headerValues.length > 0) {
            // the last header row provides the column titles
            if (headerValues[headerValues.length - 1][idx] != null) {
                title = headerValues[headerValues.length - 1][idx];
                colId = title;
                if (!Util.validateId(title)) {
                    colId = Util.toId(colId);
                }
            }
        }
        profile.setTitle(title);
        profile.setId(colId);
        // NOTE(review): presumably getColumnData extracts column idx from the sample and
        // assumeColumnDetails fills in the remaining profile details from it - confirm
        List<String> samples = getColumnData(idx, dataValues);
        assumeColumnDetails(profile, samples);
    }
    result.setColumns(profiles);
    return result;
}
Also used : InputStreamReader(java.io.InputStreamReader) CsvParseException(org.pentaho.platform.dataaccess.datasource.wizard.models.CsvParseException) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) DecimalFormat(java.text.DecimalFormat) ArrayList(java.util.ArrayList) ColumnInfo(org.pentaho.platform.dataaccess.datasource.wizard.models.ColumnInfo) CSVTokenizer(org.pentaho.reporting.libraries.base.util.CSVTokenizer) DataRow(org.pentaho.platform.dataaccess.datasource.wizard.models.DataRow) FileInputStream(java.io.FileInputStream) IOException(java.io.IOException) CsvParseException(org.pentaho.platform.dataaccess.datasource.wizard.models.CsvParseException) FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) List(java.util.List)

Example 2 with CSVTokenizer

Use of org.pentaho.reporting.libraries.base.util.CSVTokenizer in the project data-access by pentaho.

In the class CsvFileInfo, the method parseSampleContents.

/**
 * Tokenizes every line of the sample contents with the configured delimiter and
 * enclosure.
 *
 * @return one list of tokens per line of the sample contents, in the same order
 * @throws IllegalStateException if the sample contents are null
 */
public List<List<String>> parseSampleContents() {
    String separator = getDelimiter();
    if (contents == null) {
        throw new IllegalStateException("Sample Contents is null, nothing to parse"); // $NON-NLS-1$
    }
    if (separator == null || separator.length() == 0) {
        // no usable delimiter: fall back to an unlikely one so that each line comes
        // back as a single un-parsed token
        separator = "~!@#$%"; // $NON-NLS-1$
    }
    // an empty enclosure is handed to the tokenizer as null
    String quote = "".equals(getEnclosure()) ? null : getEnclosure();
    List<List<String>> parsedLines = new ArrayList<List<String>>();
    for (String sampleLine : contents) {
        CSVTokenizer tokenizer = new CSVTokenizer(sampleLine, separator, quote);
        List<String> tokens = new ArrayList<String>();
        for (; tokenizer.hasMoreTokens(); ) {
            String token = tokenizer.nextToken();
            tokens.add(token);
        }
        parsedLines.add(tokens);
    }
    return parsedLines;
}
Also used : ArrayList(java.util.ArrayList) List(java.util.List) ArrayList(java.util.ArrayList) CSVTokenizer(org.pentaho.reporting.libraries.base.util.CSVTokenizer)

Example 3 with CSVTokenizer

Use of org.pentaho.reporting.libraries.base.util.CSVTokenizer in the project pentaho-kettle by pentaho.

In the class TextFileInputIT, the method testGetLine_FILE_FORMAT_MIXED.

/**
 * Verify that lines are properly identified when parsing a mixed format file.
 *
 * <p>Reads only the first line of the example file and checks that it tokenizes into
 * 4 tokens. The reader is closed once the check has run, whether it passes or fails.
 *
 * @throws Exception if the file cannot be opened, read or closed
 */
public void testGetLine_FILE_FORMAT_MIXED() throws Exception {
    String fileLocation = "src/it/resources/example.csv";
    InputStream inputStream = KettleVFS.getInputStream(fileLocation);
    InputStreamReader reader = new InputStreamReader(inputStream);
    try {
        // Grab the first line and verify it only has 4 tokens instead of 24 (the total tokens in the file)
        StringBuilder stringBuilder = new StringBuilder(1000);
        String line = TextFileInput.getLine(null, reader, TextFileInputMeta.FILE_FORMAT_MIXED, stringBuilder);
        CSVTokenizer csvt = new CSVTokenizer(line, ",", "\"");
        assertEquals(4, csvt.countTokens());
    } finally {
        // closing the reader also closes the underlying input stream
        reader.close();
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) CSVTokenizer(org.pentaho.reporting.libraries.base.util.CSVTokenizer) ValueMetaString(org.pentaho.di.core.row.value.ValueMetaString)

Aggregations

CSVTokenizer (org.pentaho.reporting.libraries.base.util.CSVTokenizer): 3, InputStream (java.io.InputStream): 2, InputStreamReader (java.io.InputStreamReader): 2, ArrayList (java.util.ArrayList): 2, List (java.util.List): 2, FileInputStream (java.io.FileInputStream): 1, FileNotFoundException (java.io.FileNotFoundException): 1, IOException (java.io.IOException): 1, DecimalFormat (java.text.DecimalFormat): 1, ValueMetaString (org.pentaho.di.core.row.value.ValueMetaString): 1, ColumnInfo (org.pentaho.platform.dataaccess.datasource.wizard.models.ColumnInfo): 1, CsvParseException (org.pentaho.platform.dataaccess.datasource.wizard.models.CsvParseException): 1, DataRow (org.pentaho.platform.dataaccess.datasource.wizard.models.DataRow): 1