Use of org.apache.commons.csv.CSVRecord in project hmftools by hartwigmedical:
the readEntries method of the TreatmentCurator class.
/**
 * Reads drug curation entries from a CSV mapping stream.
 * <p>
 * Each CSV row must carry a canonical drug name, a drug type and an optional
 * "other names" cell; the latter is itself a small embedded CSV list that is
 * parsed and flattened into individual synonyms.
 *
 * @param mappingInputStream stream over the CSV mapping file (first row is the header)
 * @return one {@link DrugEntry} per CSV row
 * @throws IOException if either the outer file or an embedded name list cannot be parsed
 */
@NotNull
private List<DrugEntry> readEntries(@NotNull final InputStream mappingInputStream) throws IOException {
    final List<DrugEntry> drugEntries = Lists.newArrayList();
    // try-with-resources: the original version never closed the parser (or the
    // underlying stream), leaking the resource on every call.
    try (final CSVParser parser = CSVParser.parse(mappingInputStream, Charset.defaultCharset(), CSVFormat.DEFAULT.withHeader())) {
        for (final CSVRecord record : parser) {
            final String canonicalName = record.get(CANONICAL_DRUG_NAME_CSV_FIELD).trim();
            final String drugType = record.get(DRUG_TYPE_CSV_FIELD).trim();
            final String otherNamesString = record.get(OTHER_DRUG_NAMES_CSV_FIELD).trim();
            final List<String> drugNames = Lists.newArrayList();
            if (!otherNamesString.isEmpty()) {
                // The "other names" cell is itself CSV; flatten every field of
                // every embedded record into the synonym list.
                try (final CSVParser otherNamesParser = CSVParser.parse(otherNamesString, CSVFormat.DEFAULT)) {
                    for (final CSVRecord otherNames : otherNamesParser) {
                        for (final String name : otherNames) {
                            drugNames.add(name.trim());
                        }
                    }
                }
            }
            drugEntries.add(ImmutableDrugEntry.of(drugNames, drugType, canonicalName));
        }
    }
    return drugEntries;
}
Use of org.apache.commons.csv.CSVRecord in project kanonizo by kanonizo:
the parseKillMap method of the Mutation class.
/**
 * Parses a mutation kill map CSV and records, for each test case, the set of
 * mutants it kills.
 * <p>
 * Each data row is expected to hold a 1-based test-case index in column 0 and
 * a mutant id in column 1. Matching mutants are looked up via
 * {@code getMutants} and accumulated in the static {@code killMap}.
 *
 * @param kill      the kill-map CSV file
 * @param testSuite suite whose original ordering is indexed by the test-case column
 */
private static void parseKillMap(File kill, TestSuite testSuite) {
    // try-with-resources replaces the original manual close: that version
    // could throw a NullPointerException in its finally block whenever the
    // FileReader constructor failed (parser was still null), and otherwise
    // duplicated close/exception plumbing.
    try (CSVParser parser = new CSVParser(new FileReader(kill), CSVFormat.DEFAULT)) {
        for (CSVRecord record : parser.getRecords()) {
            // NOTE(review): CSVRecord numbers are 1-based, so the original
            // "== 0" comparison never matched and never skipped anything.
            // Preserved as a no-op here; if kill-map files carry a header row,
            // this should compare against 1 instead — confirm with real data.
            if (record.getRecordNumber() == 0) {
                continue;
            }
            int testCase = Integer.parseInt(record.get(0));
            int mutantKilled = Integer.parseInt(record.get(1));
            // The test-case column is 1-based; the ordering list is 0-based.
            TestCase test = testSuite.getOriginalOrdering().get(testCase - 1);
            // computeIfAbsent replaces the containsKey/put pair.
            killMap.computeIfAbsent(test, k -> new ArrayList<Mutant>())
                    .addAll(getMutants(mutant -> mutant.getMutantId() == mutantKilled));
        }
    } catch (IOException e) {
        // Best-effort parse: failures are reported but not propagated,
        // matching the original behavior. (FileNotFoundException is a
        // subclass of IOException, so one catch covers both.)
        e.printStackTrace();
    }
}
Use of org.apache.commons.csv.CSVRecord in project dataverse by IQSS:
the readFile method of the CSVFileReader class.
/**
 * Ingests a CSV file in two passes and writes a tab-delimited version to finalOut.
 * <p>
 * Pass 1 ("learning") reads every record, infers a type for each column
 * (integer / numeric / time / date / character), and spools the records to a
 * temporary file. The DataTable's variables are then re-typed from the
 * inferred flags. Pass 2 re-reads the temp file and emits each row as a
 * tab-separated line, normalizing special numeric tokens and quoting strings.
 *
 * @param csvReader reader over the incoming CSV (closed by this method)
 * @param dataTable table whose variables and case quantity are populated
 * @param finalOut  destination for the tab-delimited output (closed by this method)
 * @return the number of data rows written
 * @throws IOException on empty/invalid headers, ragged records, or a
 *         line-count mismatch between the two passes
 */
public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter finalOut) throws IOException {
List<DataVariable> variableList = new ArrayList<>();
CSVParser parser = new CSVParser(csvReader, inFormat.withHeader());
Map<String, Integer> headers = parser.getHeaderMap();
int i = 0;
// Create one DataVariable per CSV column; every header name must be non-empty.
// Variables start out as discrete character; pass 1 may re-type them below.
for (String varName : headers.keySet()) {
if (varName == null || varName.isEmpty()) {
// -- L.A. 4.0 alpha 1
throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.invalidHeader"));
}
DataVariable dv = new DataVariable();
dv.setName(varName);
dv.setLabel(varName);
dv.setInvalidRanges(new ArrayList<>());
dv.setSummaryStatistics(new ArrayList<>());
dv.setUnf("UNF:6:NOTCALCULATED");
dv.setCategories(new ArrayList<>());
variableList.add(dv);
dv.setTypeCharacter();
dv.setIntervalDiscrete();
dv.setFileOrder(i);
dv.setDataTable(dataTable);
i++;
}
dataTable.setVarQuantity((long) variableList.size());
dataTable.setDataVariables(variableList);
// Per-column inference flags; a flag is cleared the first time a value is
// seen that contradicts it (optimistic start: everything could be anything).
boolean[] isNumericVariable = new boolean[headers.size()];
boolean[] isIntegerVariable = new boolean[headers.size()];
boolean[] isTimeVariable = new boolean[headers.size()];
boolean[] isDateVariable = new boolean[headers.size()];
for (i = 0; i < headers.size(); i++) {
// OK, let's assume that every variable is numeric;
// but we'll go through the file and examine every value; the
// moment we find a value that's not a legit numeric one, we'll
// assume that it is in fact a String.
isNumericVariable[i] = true;
isIntegerVariable[i] = true;
isDateVariable[i] = true;
isTimeVariable[i] = true;
}
// First, "learning" pass.
// (we'll save the incoming stream in another temp file:)
// Once a column's date/time format is detected it is pinned here and later
// values must match that same format.
SimpleDateFormat[] selectedDateTimeFormat = new SimpleDateFormat[headers.size()];
SimpleDateFormat[] selectedDateFormat = new SimpleDateFormat[headers.size()];
File firstPassTempFile = File.createTempFile("firstpass-", ".csv");
try (CSVPrinter csvFilePrinter = new CSVPrinter(// TODO allow other parsers of tabular data to use this parser by changin inFormat
new FileWriter(firstPassTempFile.getAbsolutePath()), inFormat)) {
// Write headers
csvFilePrinter.printRecord(headers.keySet());
for (CSVRecord record : parser.getRecords()) {
// Checks if #records = #columns in header
if (!record.isConsistent()) {
List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1), "" + headers.size(), "" + record.size() });
throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
}
for (i = 0; i < headers.size(); i++) {
String varString = record.get(i);
// Integer check: empty and "null" tokens are tolerated; otherwise the
// first character must be a sign/digit (firstNumCharSet) and the rest
// strictly numeric.
isIntegerVariable[i] = isIntegerVariable[i] && varString != null && (varString.isEmpty() || varString.equals("null") || (firstNumCharSet.contains(varString.charAt(0)) && StringUtils.isNumeric(varString.substring(1))));
if (isNumericVariable[i]) {
// If variable might be "numeric" test to see if this value is a parsable number:
if (varString != null && !varString.isEmpty()) {
boolean isNumeric = false;
boolean isInteger = false;
// Special tokens (NaN, infinities, null) count as numeric; "continue"
// moves to the next column while leaving the numeric flag set.
if (varString.equalsIgnoreCase("NaN") || varString.equalsIgnoreCase("NA") || varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf") || varString.equalsIgnoreCase("-Inf") || varString.equalsIgnoreCase("null")) {
continue;
} else {
try {
// A successful Double parse also "continue"s with the flag intact.
Double testDoubleValue = new Double(varString);
continue;
} catch (NumberFormatException ex) {
// the token failed to parse as a double
// so the column is a string variable.
}
}
// Only reached when the value parsed as neither a special token nor
// a double: demote the column to string and fall through to the
// date/time checks below.
isNumericVariable[i] = false;
}
}
// by parsing the cell as a date or date-time value:
if (!isNumericVariable[i]) {
Date dateResult = null;
if (isTimeVariable[i]) {
if (varString != null && !varString.isEmpty()) {
boolean isTime = false;
if (selectedDateTimeFormat[i] != null) {
// A format was already pinned for this column; the value must
// parse fully (ParsePosition consumed the whole string).
ParsePosition pos = new ParsePosition(0);
dateResult = selectedDateTimeFormat[i].parse(varString, pos);
if (dateResult != null && pos.getIndex() == varString.length()) {
// OK, successfully parsed a value!
isTime = true;
}
} else {
// No format pinned yet: try each candidate and pin the first
// that consumes the entire value.
for (SimpleDateFormat format : TIME_FORMATS) {
ParsePosition pos = new ParsePosition(0);
dateResult = format.parse(varString, pos);
if (dateResult != null && pos.getIndex() == varString.length()) {
// OK, successfully parsed a value!
isTime = true;
selectedDateTimeFormat[i] = format;
break;
}
}
}
if (!isTime) {
isTimeVariable[i] = false;
// if the token didn't parse as a time value,
// we will still try to parse it as a date, below.
// unless this column is NOT a date.
} else {
// And if it is a time value, we are going to assume it's
// NOT a date.
isDateVariable[i] = false;
}
}
}
if (isDateVariable[i]) {
if (varString != null && !varString.isEmpty()) {
boolean isDate = false;
// -- L.A. 4.0 beta
for (SimpleDateFormat format : DATE_FORMATS) {
// Strict parsing - it will throw an
// exception if it doesn't parse!
format.setLenient(false);
try {
format.parse(varString);
isDate = true;
selectedDateFormat[i] = format;
break;
} catch (ParseException ex) {
// Do nothing
}
}
isDateVariable[i] = isDate;
}
}
}
}
// Spool the record to the temp file for the second pass.
csvFilePrinter.printRecord(record);
}
}
dataTable.setCaseQuantity(parser.getRecordNumber());
parser.close();
csvReader.close();
// Re-type the variables that we've determined are numerics:
for (i = 0; i < headers.size(); i++) {
if (isNumericVariable[i]) {
dataTable.getDataVariables().get(i).setTypeNumeric();
if (isIntegerVariable[i]) {
dataTable.getDataVariables().get(i).setIntervalDiscrete();
} else {
dataTable.getDataVariables().get(i).setIntervalContinuous();
}
} else if (isDateVariable[i] && selectedDateFormat[i] != null) {
// Dates are still Strings, i.e., they are "character" and "discrete";
// But we add special format values for them:
// NOTE(review): the format recorded here is DATE_FORMATS[0], not the
// per-column selectedDateFormat[i] chosen above — confirm intentional.
dataTable.getDataVariables().get(i).setFormat(DATE_FORMATS[0].toPattern());
dataTable.getDataVariables().get(i).setFormatCategory("date");
} else if (isTimeVariable[i] && selectedDateTimeFormat[i] != null) {
// Same for time values:
dataTable.getDataVariables().get(i).setFormat(selectedDateTimeFormat[i].toPattern());
dataTable.getDataVariables().get(i).setFormatCategory("time");
}
}
// Second, final pass.
try (BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) {
parser = new CSVParser(secondPassReader, inFormat.withHeader());
String[] caseRow = new String[headers.size()];
for (CSVRecord record : parser) {
if (!record.isConsistent()) {
List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1), "" + headers.size(), "" + record.size() });
throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
}
for (i = 0; i < headers.size(); i++) {
String varString = record.get(i);
if (isNumericVariable[i]) {
// Numeric columns: normalize missing/special tokens, otherwise
// keep the literal text for archival fidelity.
if (varString == null || varString.isEmpty() || varString.equalsIgnoreCase("NA")) {
// Missing value - represented as an empty string in
// the final tab file
caseRow[i] = "";
} else if (varString.equalsIgnoreCase("NaN")) {
// "Not a Number" special value:
caseRow[i] = "NaN";
} else if (varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf")) {
// Positive infinity:
caseRow[i] = "Inf";
} else if (varString.equalsIgnoreCase("-Inf")) {
// Negative infinity:
caseRow[i] = "-Inf";
} else if (varString.equalsIgnoreCase("null")) {
// By request from Gus - "NULL" is recognized as a
// numeric zero:
caseRow[i] = isIntegerVariable[i] ? "0" : "0.0";
} else {
/* No re-formatting is done on any other numeric values.
* We'll save them as they were, for archival purposes.
* The alternative solution - formatting in sci. notation
* is commented-out below.
*/
caseRow[i] = varString;
/*
if (isIntegerVariable[i]) {
try {
Integer testIntegerValue = new Integer(varString);
caseRow[i] = testIntegerValue.toString();
} catch (NumberFormatException ex) {
throw new IOException("Failed to parse a value recognized as an integer in the first pass! (?)");
}
} else {
try {
Double testDoubleValue = new Double(varString);
if (testDoubleValue.equals(0.0)) {
caseRow[i] = "0.0";
} else {
// One possible implementation:
//
// Round our fractional values to 15 digits
// (minimum number of digits of precision guaranteed by
// type Double) and format the resulting representations
// in a IEEE 754-like "scientific notation" - for ex.,
// 753.24 will be encoded as 7.5324e2
BigDecimal testBigDecimal = new BigDecimal(varString, doubleMathContext);
caseRow[i] = String.format(FORMAT_IEEE754, testBigDecimal);
// Strip meaningless zeros and extra + signs:
caseRow[i] = caseRow[i].replaceFirst("00*e", "e");
caseRow[i] = caseRow[i].replaceFirst("\\.e", ".0e");
caseRow[i] = caseRow[i].replaceFirst("e\\+00", "");
caseRow[i] = caseRow[i].replaceFirst("^\\+", "");
}
} catch (NumberFormatException ex) {
throw new IOException("Failed to parse a value recognized as numeric in the first pass! (?)");
}
}
*/
}
} else if (isTimeVariable[i] || isDateVariable[i]) {
// Time and Dates are stored NOT quoted (don't ask).
if (varString != null) {
// Dealing with quotes:
// remove the leading and trailing quotes, if present:
varString = varString.replaceFirst("^\"*", "");
varString = varString.replaceFirst("\"*$", "");
caseRow[i] = varString;
} else {
caseRow[i] = "";
}
} else {
// Character columns: emitted quoted, so empty strings are
// indeed empty strings, and NOT missing values:
if (varString != null) {
// escape the quotes, newlines, and tabs:
varString = varString.replace("\"", "\\\"");
varString = varString.replace("\n", "\\n");
varString = varString.replace("\t", "\\t");
// final pair of quotes:
varString = "\"" + varString + "\"";
caseRow[i] = varString;
} else {
caseRow[i] = "\"\"";
}
}
}
finalOut.println(StringUtils.join(caseRow, "\t"));
}
}
long linecount = parser.getRecordNumber();
finalOut.close();
parser.close();
dbglog.fine("Tmp File: " + firstPassTempFile);
// Firstpass file is deleted to prevent tmp from filling up.
firstPassTempFile.delete();
// Sanity check: both passes must have seen the same number of rows.
if (dataTable.getCaseQuantity().intValue() != linecount) {
List<String> args = Arrays.asList(new String[] { "" + dataTable.getCaseQuantity().intValue(), "" + linecount });
throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.line_mismatch", args));
}
return (int) linecount;
}
Use of org.apache.commons.csv.CSVRecord in project jaqy by Teradata:
the getSchemaInfo method of the CSVUtils class.
/**
 * Scans CSV records to infer a column schema (type, precision, scale,
 * nullability) for each column.
 * <p>
 * Numeric columns are detected via BigDecimal parsing; values that fail to
 * parse demote the column to CHAR (if all values so far share one length) or
 * VARCHAR. Columns matching any of naValues are marked nullable.
 *
 * @param headers  column names, or null to auto-generate "colN" names
 * @param iterator source of CSV records
 * @param naValues tokens to treat as NULL/missing, or null for none
 * @param precise  if true, fractional numerics become DECIMAL instead of DOUBLE
 * @param limit    max records to scan; 0 = unlimited, negative = unlimited but
 *                 may stop early once all columns look confidently numeric
 * @return the inferred schema, or null if the input had no records
 */
public static SchemaInfo getSchemaInfo(String[] headers, Iterator<CSVRecord> iterator, String[] naValues, boolean precise, long limit) {
int count = -1;
ScanColumnType[] columns = null;
int rowCount = 0;
// autoStop: allowed to quit scanning early once every column has enough
// confidently-typed non-null values (see AUTO_STOP_MINIMUM below).
boolean autoStop = false;
if (limit < 0) {
limit = Long.MAX_VALUE;
autoStop = true;
} else if (limit == 0)
limit = Long.MAX_VALUE;
boolean needScan;
while (iterator.hasNext() && rowCount < limit) {
CSVRecord record = iterator.next();
++rowCount;
int size = record.size();
needScan = false;
if (count == -1) {
// First record fixes the column count and initializes the per-column
// scan state. Types start as NULL (= "unknown, possibly numeric").
// NOTE(review): record.get(i) below assumes every later record has at
// least this many fields — confirm ragged input cannot occur here.
count = size;
columns = new ScanColumnType[count];
for (int i = 0; i < count; ++i) {
columns[i] = new ScanColumnType();
columns[i].type = Types.NULL;
columns[i].nullable = false;
columns[i].minLength = Integer.MAX_VALUE;
columns[i].maxLength = -1;
}
needScan = true;
}
for (int i = 0; i < count; ++i) {
String s = record.get(i);
boolean isNa = false;
if (naValues != null) {
for (String na : naValues) {
if (s.equals(na)) {
isNa = true;
break;
}
}
}
if (isNa) {
// NA token: only affects nullability, not the type inference.
columns[i].nullable = true;
} else {
int len = s.length();
if (columns[i].maxLength < len)
columns[i].maxLength = len;
if (columns[i].minLength > len)
columns[i].minLength = len;
if (columns[i].type == Types.NUMERIC || columns[i].type == Types.NULL) {
try {
BigDecimal dec = new BigDecimal(s);
int precision = dec.precision();
int scale = dec.scale();
// if precision is smaller than or equal to scale, then we have leading "0."
if (precision <= scale)
precision = scale + 1;
if (columns[i].type == Types.NULL) {
columns[i].type = Types.NUMERIC;
columns[i].precision = precision;
columns[i].scale = scale;
} else {
// Merge with the running precision/scale; a scale mismatch is
// recorded as the Integer.MAX_VALUE sentinel, which later maps
// the column to DOUBLE.
if (columns[i].scale != scale) {
columns[i].scale = Integer.MAX_VALUE;
}
if (columns[i].precision < precision) {
columns[i].precision = precision;
}
}
++columns[i].notNullCount;
} catch (Exception ex) {
// Not parseable as a number: demote to CHAR/VARCHAR.
if (columns[i].minLength == columns[i].maxLength) {
// Check if we are in a fixed char column.
columns[i].type = Types.CHAR;
++columns[i].notNullCount;
} else {
columns[i].type = Types.VARCHAR;
// For varchar columns, we basically have to scan
// all the rows to find the maximum string length.
autoStop = false;
}
}
} else if (columns[i].type == Types.CHAR) {
// Still fixed-length so far; a differing length demotes to VARCHAR.
if (columns[i].minLength == columns[i].maxLength)
++columns[i].notNullCount;
else {
columns[i].type = Types.VARCHAR;
// For varchar columns, we basically have to scan
// all the rows to find the maximum string length.
autoStop = false;
}
}
}
if (autoStop && columns[i].notNullCount < AUTO_STOP_MINIMUM) {
// For each number column, we basically need enough
// confidence to say that additional scan is not
// necessary.
needScan = true;
}
}
if (autoStop && !needScan) {
// Automatically stop if we just have numbers.
break;
}
}
if (rowCount == 0)
return null;
// Convert the scan state into the returned column descriptors.
FullColumnInfo[] columnInfos = new FullColumnInfo[count];
for (int i = 0; i < count; ++i) {
columnInfos[i] = new FullColumnInfo();
if (headers != null) {
columnInfos[i].name = headers[i];
}
if (columnInfos[i].name == null || columnInfos[i].name.trim().length() == 0) {
// Fall back to generated 1-based names for blank/missing headers.
columnInfos[i].name = "col" + (i + 1);
}
columnInfos[i].label = columnInfos[i].name;
columnInfos[i].nullable = columns[i].nullable ? ResultSetMetaData.columnNullable : ResultSetMetaData.columnNoNulls;
if (columns[i].type == Types.CHAR || columns[i].type == Types.VARCHAR) {
columnInfos[i].type = columns[i].type;
columnInfos[i].precision = columns[i].maxLength;
} else {
// Numeric mapping: mixed scales -> DOUBLE; small whole numbers ->
// INTEGER; fractional + precise mode -> DECIMAL; otherwise DOUBLE.
columnInfos[i].precision = columns[i].precision;
if (columns[i].scale == Integer.MAX_VALUE) {
columnInfos[i].type = Types.DOUBLE;
columnInfos[i].scale = 0;
} else if (columns[i].scale <= 0 && columns[i].precision < 11) {
columnInfos[i].type = Types.INTEGER;
columnInfos[i].scale = 0;
} else if (precise && columns[i].scale > 0) {
columnInfos[i].type = Types.DECIMAL;
columnInfos[i].scale = columns[i].scale;
} else {
columnInfos[i].type = Types.DOUBLE;
columnInfos[i].scale = 0;
}
}
}
return new SchemaInfo(columnInfos);
}
Use of org.apache.commons.csv.CSVRecord in project lumberjack by fn-ctional:
the multipartFileToRecords method of the WebBackend class.
/**
 * Converts an uploaded multipart CSV file into an iterable of parsed records.
 * <p>
 * The upload is spooled to a local file named after the client-supplied
 * original filename, then parsed lazily with the default CSV format.
 *
 * @param csv the uploaded CSV file
 * @return lazily-parsed CSV records
 * @throws FileUploadException if the upload cannot be written or opened
 */
private Iterable<CSVRecord> multipartFileToRecords(MultipartFile csv) throws FileUploadException {
    try {
        // NOTE(review): the target path comes directly from the client-supplied
        // filename — confirm it cannot escape the working directory.
        File spooled = new File(csv.getOriginalFilename());
        csv.transferTo(spooled);
        // NOTE(review): FileReader uses the platform default charset — confirm
        // uploads are guaranteed to be in that encoding.
        Reader reader = new FileReader(spooled);
        // NOTE(review): withFirstRecordAsHeader() resets the header definition,
        // so the HEADERS names set just before it are discarded and header names
        // are taken from the file's first row instead — confirm intended.
        // The reader is left open on purpose: parsing is lazy and closing it
        // here would break iteration by the caller.
        return CSVFormat.DEFAULT.withHeader(HEADERS).withFirstRecordAsHeader().parse(reader);
    } catch (IOException e) {
        // NOTE(review): the original cause is dropped here; attach it if
        // FileUploadException ever grows a (Throwable) constructor.
        throw new FileUploadException();
    }
}
Aggregations