Use of org.pentaho.reporting.libraries.base.util.CSVTokenizer in the data-access project by Pentaho.
From class CsvUtils, method getDataProfile.
/**
 * Builds a {@link DataProfile} from a sample of the delimited text file at {@code fileLocation}.
 *
 * <p>At most {@code rowLimit} lines are read (header lines count towards the limit). Each line is
 * split with a {@link CSVTokenizer} using the delimiter and enclosure from {@code fileInfo}, and
 * every non-null token is trimmed. The first {@code fileInfo.getHeaderRows()} lines are treated as
 * header rows; the remaining lines become the data rows of the profile. Rows shorter than the
 * widest sampled row are padded with {@code null} cells.
 *
 * <p>Column titles come from the last header row when it has a value for the column; otherwise a
 * default title ({@code CsvFileInfo.DEFAULT_COLUMN_NAME_PREFIX} plus a three-digit, 1-based index)
 * and a default id ({@code "PC_" + index}) are used.
 *
 * @param fileInfo     supplies delimiter, enclosure and the number of header rows
 * @param rowLimit     maximum number of lines to sample from the file
 * @param fileLocation path of the file to read
 * @param fileType     file format constant passed through to {@code TextFileInput.getLine}
 * @param encoding     charset name used to decode the file
 * @return the profile holding the sampled data rows and one {@link ColumnInfo} per column
 * @throws CsvParseException if an {@link IllegalArgumentException} is raised while reading or
 *                           tokenizing; carries the 1-based row number and the offending line
 * @throws Exception         any other failure while opening, reading or closing the file
 */
private DataProfile getDataProfile(CsvFileInfo fileInfo, int rowLimit, String fileLocation, int fileType, String encoding) throws Exception {
  DataProfile result = new DataProfile();
  String line = null;
  int row = 0;
  List<List<String>> headerSample = new ArrayList<List<String>>();
  List<List<String>> dataSample = new ArrayList<List<String>>(rowLimit);
  int maxColumns = 0;
  // Declared outside the try so the raw stream can still be closed when wrapping it fails.
  InputStream inputStream = null;
  InputStreamReader reader = null;
  // Tells the finally block whether a close failure would mask an earlier exception.
  boolean readCompleted = false;
  try {
    inputStream = new FileInputStream(fileLocation);
    UnicodeBOMInputStream bomIs = new UnicodeBOMInputStream(inputStream);
    reader = new InputStreamReader(bomIs, encoding);
    bomIs.skipBOM();
    // read each line of text file
    StringBuilder stringBuilder = new StringBuilder(1000);
    line = TextFileInput.getLine(null, reader, fileType, stringBuilder);
    while (line != null && row < rowLimit) {
      CSVTokenizer csvt = new CSVTokenizer(line, fileInfo.getDelimiter(), fileInfo.getEnclosure());
      List<String> rowData = new ArrayList<String>();
      while (csvt.hasMoreTokens()) {
        String token = csvt.nextToken();
        if (token != null) {
          token = token.trim();
        }
        rowData.add(token);
      }
      maxColumns = Math.max(maxColumns, rowData.size());
      if (row < fileInfo.getHeaderRows()) {
        headerSample.add(rowData);
      } else {
        dataSample.add(rowData);
      }
      line = TextFileInput.getLine(null, reader, fileType, stringBuilder);
      row++;
    }
    readCompleted = true;
  } catch (IllegalArgumentException iae) {
    Logger.error(getClass().getSimpleName(), "There was an issue parsing the CSV file", iae); // $NON-NLS-1$
    throw new CsvParseException(row + 1, line);
  } catch (Exception e) {
    Logger.error(getClass().getSimpleName(), "Could not read CSV", e); // $NON-NLS-1$
    throw e;
  } finally {
    // close the file
    try {
      if (reader != null) {
        reader.close();
      } else if (inputStream != null) {
        // The reader was never created, so the raw stream has to be released directly.
        inputStream.close();
      }
    } catch (Exception closeFailure) {
      if (readCompleted) {
        // Nothing else went wrong: surface the close failure as before.
        throw closeFailure;
      }
      // An exception is already propagating; log the close failure instead of replacing it.
      Logger.error(getClass().getSimpleName(), "Could not close CSV", closeFailure); // $NON-NLS-1$
    }
  }

  String[][] headerValues = toFixedWidthGrid(headerSample, maxColumns);
  String[][] dataValues = toFixedWidthGrid(dataSample, maxColumns);

  DataRow[] data = new DataRow[dataValues.length];
  for (int rowNo = 0; rowNo < dataValues.length; rowNo++) {
    data[rowNo] = new DataRow();
    data[rowNo].setCells(dataValues[rowNo]);
  }
  result.setRows(data);

  DecimalFormat df = new DecimalFormat("000"); // $NON-NLS-1$
  ColumnInfo[] profiles = new ColumnInfo[maxColumns];
  for (int idx = 0; idx < maxColumns; idx++) {
    ColumnInfo profile = new ColumnInfo();
    profiles[idx] = profile;
    String title = CsvFileInfo.DEFAULT_COLUMN_NAME_PREFIX + df.format(idx + 1);
    String colId = "PC_" + idx; // $NON-NLS-1$
    if (headerValues.length > 0) {
      // Only the last header row is used to name the columns.
      String headerTitle = headerValues[headerValues.length - 1][idx];
      if (headerTitle != null) {
        title = headerTitle;
        colId = title;
        if (!Util.validateId(title)) {
          colId = Util.toId(colId);
        }
      }
    }
    profile.setTitle(title);
    profile.setId(colId);
    List<String> samples = getColumnData(idx, dataValues);
    assumeColumnDetails(profile, samples);
  }
  result.setColumns(profiles);
  return result;
}

/**
 * Copies the sampled rows into a {@code rows.size() x width} array; cells a row does not provide
 * are left {@code null}. {@code width} must be at least the size of the widest row.
 */
private static String[][] toFixedWidthGrid(List<List<String>> rows, int width) {
  String[][] grid = new String[rows.size()][width];
  int rowNo = 0;
  for (List<String> values : rows) {
    int colNo = 0;
    for (String value : values) {
      grid[rowNo][colNo] = value;
      colNo++;
    }
    rowNo++;
  }
  return grid;
}
Use of org.pentaho.reporting.libraries.base.util.CSVTokenizer in the data-access project by Pentaho.
From class CsvFileInfo, method parseSampleContents.
/**
 * Splits every line of the sample contents into tokens using the configured delimiter and
 * enclosure.
 *
 * @return one list of tokens per line of {@code contents}, in the same order
 * @throws IllegalStateException if {@code contents} is {@code null}
 */
public List<List<String>> parseSampleContents() {
  String separator = getDelimiter();
  if (contents == null) {
    throw new IllegalStateException("Sample Contents is null, nothing to parse"); // $NON-NLS-1$
  }
  if (separator == null || separator.length() == 0) {
    // No delimiter configured: fall back to an arbitrary one, intended to leave lines un-split.
    separator = "~!@#$%"; // $NON-NLS-1$
  }

  // An empty enclosure is handed to the tokenizer as null.
  String quote = "".equals(getEnclosure()) ? null : getEnclosure();

  List<List<String>> parsedRows = new ArrayList<List<String>>();
  for (String rawLine : contents) {
    List<String> cells = new ArrayList<String>();
    for (CSVTokenizer tokens = new CSVTokenizer(rawLine, separator, quote); tokens.hasMoreTokens();) {
      cells.add(tokens.nextToken());
    }
    parsedRows.add(cells);
  }
  return parsedRows;
}
Use of org.pentaho.reporting.libraries.base.util.CSVTokenizer in the pentaho-kettle project by Pentaho.
From class TextFileInputIT, method testGetLine_FILE_FORMAT_MIXED.
/**
 * Verify that lines are properly identified when parsing a mixed format file.
 *
 * <p>Reads only the first line of the example file with {@code FILE_FORMAT_MIXED} and checks that
 * it tokenizes into 4 values. The reader (and the stream it wraps) is closed when the test ends,
 * whether or not the assertion passes.
 */
public void testGetLine_FILE_FORMAT_MIXED() throws Exception {
  String fileLocation = "src/it/resources/example.csv";
  InputStream inputStream = KettleVFS.getInputStream(fileLocation);
  InputStreamReader reader = new InputStreamReader(inputStream);
  try {
    // Grab the first line and verify it only has 4 tokens instead of 24 (the total tokens in the file)
    StringBuilder stringBuilder = new StringBuilder(1000);
    String line = TextFileInput.getLine(null, reader, TextFileInputMeta.FILE_FORMAT_MIXED, stringBuilder);
    CSVTokenizer csvt = new CSVTokenizer(line, ",", "\"");
    assertEquals(4, csvt.countTokens());
  } finally {
    // Closing the reader also closes the underlying VFS input stream.
    reader.close();
  }
}
Aggregations