Search in sources :

Example 76 with CSVRecord

use of org.apache.commons.csv.CSVRecord in project hmftools by hartwigmedical.

the class TreatmentCurator method readEntries.

/**
 * Reads the drug mapping CSV and converts every record into a {@link DrugEntry}.
 *
 * <p>The input is parsed with its first line as header. Each record supplies a canonical drug name,
 * a drug type and an optional field of other names; that field is itself parsed as CSV so it may
 * hold several names. All values are trimmed.
 *
 * @param mappingInputStream stream holding the mapping CSV; it is not closed by this method
 * @return one entry per CSV record, in the order the records were read
 * @throws IOException if the stream or the nested names field cannot be parsed
 */
@NotNull
private List<DrugEntry> readEntries(@NotNull final InputStream mappingInputStream) throws IOException {
    final List<DrugEntry> drugEntries = Lists.newArrayList();
    // The outer parser is intentionally left open: closing it would also close the caller-owned stream.
    final CSVParser parser = CSVParser.parse(mappingInputStream, Charset.defaultCharset(), CSVFormat.DEFAULT.withHeader());
    for (final CSVRecord record : parser) {
        final String canonicalName = record.get(CANONICAL_DRUG_NAME_CSV_FIELD).trim();
        final String drugType = record.get(DRUG_TYPE_CSV_FIELD).trim();
        final String otherNamesString = record.get(OTHER_DRUG_NAMES_CSV_FIELD).trim();
        final List<String> drugNames = Lists.newArrayList();
        if (!otherNamesString.isEmpty()) {
            // The nested parser is owned by this method, so release it as soon as the names are collected.
            try (CSVParser otherNamesParser = CSVParser.parse(otherNamesString, CSVFormat.DEFAULT)) {
                for (final CSVRecord otherNames : otherNamesParser) {
                    for (final String name : otherNames) {
                        drugNames.add(name.trim());
                    }
                }
            }
        }
        drugEntries.add(ImmutableDrugEntry.of(drugNames, drugType, canonicalName));
    }
    return drugEntries;
}
Also used : CSVParser(org.apache.commons.csv.CSVParser) CSVRecord(org.apache.commons.csv.CSVRecord) NotNull(org.jetbrains.annotations.NotNull)

Example 77 with CSVRecord

use of org.apache.commons.csv.CSVRecord in project kanonizo by kanonizo.

the class Mutation method parseKillMap.

/**
 * Loads a kill-map CSV and records, for every listed test case, the mutants it kills.
 *
 * <p>Each record is read as {@code testCaseNumber,mutantId}. The test case number is used as a
 * 1-based index into {@code testSuite.getOriginalOrdering()}, and all mutants whose id equals
 * {@code mutantId} are appended to that test case's entry in {@code killMap}.
 *
 * <p>I/O failures (including a missing file) are printed via {@code printStackTrace()} and the
 * method returns normally; non-numeric cells still raise {@link NumberFormatException}.
 *
 * @param kill      CSV file to read
 * @param testSuite suite whose original ordering resolves test case numbers
 */
private static void parseKillMap(File kill, TestSuite testSuite) {
    // try-with-resources closes reader and parser on every path. The previous finally block
    // dereferenced a null parser when the file could not be opened, masking the real error.
    try (FileReader reader = new FileReader(kill);
        CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT)) {
        for (CSVRecord record : parser.getRecords()) {
            // NOTE(review): Commons CSV record numbers start at 1, so this guard never fires.
            // If the intent is to skip a header row, confirm the file layout before changing it.
            if (record.getRecordNumber() == 0) {
                continue;
            }
            int testCase = Integer.parseInt(record.get(0));
            int mutantKilled = Integer.parseInt(record.get(1));
            TestCase test = testSuite.getOriginalOrdering().get(testCase - 1);
            if (!killMap.containsKey(test)) {
                killMap.put(test, new ArrayList<Mutant>());
            }
            killMap.get(test).addAll(getMutants(mutant -> mutant.getMutantId() == mutantKilled));
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : TestSuite(org.kanonizo.framework.objects.TestSuite) Predicate(java.util.function.Predicate) CSVRecord(org.apache.commons.csv.CSVRecord) Scanner(java.util.Scanner) IOException(java.io.IOException) HashMap(java.util.HashMap) Parameter(com.scythe.instrumenter.InstrumentationProperties.Parameter) Collectors(java.util.stream.Collectors) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) TestCase(org.kanonizo.framework.objects.TestCase) List(java.util.List) Util(org.kanonizo.util.Util) CSVFormat(org.apache.commons.csv.CSVFormat) ClassAnalyzer(com.scythe.instrumenter.analysis.ClassAnalyzer) Map(java.util.Map) CSVParser(org.apache.commons.csv.CSVParser) FileReader(java.io.FileReader) TestCase(org.kanonizo.framework.objects.TestCase) CSVParser(org.apache.commons.csv.CSVParser) FileNotFoundException(java.io.FileNotFoundException) FileReader(java.io.FileReader) CSVRecord(org.apache.commons.csv.CSVRecord) IOException(java.io.IOException)

Example 78 with CSVRecord

use of org.apache.commons.csv.CSVRecord in project dataverse by IQSS.

the class CSVFileReader method readFile.

/**
 * Reads CSV data from {@code csvReader} in two passes and writes it to {@code finalOut} as
 * tab-separated rows.
 *
 * <p>The header row defines one {@link DataVariable} per column; the variables are attached to
 * {@code dataTable}. The first pass copies every record into a temporary file while working out,
 * per column, whether all values look numeric, integer, time or date. The second pass re-reads
 * the temporary file and prints each record with its values formatted according to the detected
 * column type.
 *
 * <p>On the normal (non-exception) path this method closes {@code csvReader} and
 * {@code finalOut} and deletes the temporary file.
 *
 * @param csvReader source of the CSV text, including the header line
 * @param dataTable table that receives the variable definitions and the case count
 * @param finalOut  destination for the tab-separated output
 * @return the number of records read in the second pass
 * @throws IOException if a header name is null or empty, a record is inconsistent with the
 *                     header, the two passes see different record counts, or reading/writing fails
 */
public int readFile(BufferedReader csvReader, DataTable dataTable, PrintWriter finalOut) throws IOException {
    List<DataVariable> variableList = new ArrayList<>();
    CSVParser parser = new CSVParser(csvReader, inFormat.withHeader());
    Map<String, Integer> headers = parser.getHeaderMap();
    int i = 0;
    // Create one variable per header column; every variable starts out as "character"/"discrete"
    // and is re-typed after the first pass.
    for (String varName : headers.keySet()) {
        if (varName == null || varName.isEmpty()) {
            // -- L.A. 4.0 alpha 1
            throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.invalidHeader"));
        }
        DataVariable dv = new DataVariable();
        dv.setName(varName);
        dv.setLabel(varName);
        dv.setInvalidRanges(new ArrayList<>());
        dv.setSummaryStatistics(new ArrayList<>());
        dv.setUnf("UNF:6:NOTCALCULATED");
        dv.setCategories(new ArrayList<>());
        variableList.add(dv);
        dv.setTypeCharacter();
        dv.setIntervalDiscrete();
        dv.setFileOrder(i);
        dv.setDataTable(dataTable);
        i++;
    }
    dataTable.setVarQuantity((long) variableList.size());
    dataTable.setDataVariables(variableList);
    // Per-column type flags, indexed by column position.
    boolean[] isNumericVariable = new boolean[headers.size()];
    boolean[] isIntegerVariable = new boolean[headers.size()];
    boolean[] isTimeVariable = new boolean[headers.size()];
    boolean[] isDateVariable = new boolean[headers.size()];
    for (i = 0; i < headers.size(); i++) {
        // OK, let's assume that every variable is numeric;
        // but we'll go through the file and examine every value; the
        // moment we find a value that's not a legit numeric one, we'll
        // assume that it is in fact a String.
        isNumericVariable[i] = true;
        isIntegerVariable[i] = true;
        isDateVariable[i] = true;
        isTimeVariable[i] = true;
    }
    // First, "learning" pass.
    // (we'll save the incoming stream in another temp file:)
    SimpleDateFormat[] selectedDateTimeFormat = new SimpleDateFormat[headers.size()];
    SimpleDateFormat[] selectedDateFormat = new SimpleDateFormat[headers.size()];
    File firstPassTempFile = File.createTempFile("firstpass-", ".csv");
    try (CSVPrinter csvFilePrinter = new CSVPrinter(// TODO allow other parsers of tabular data to use this parser by changing inFormat
    new FileWriter(firstPassTempFile.getAbsolutePath()), inFormat)) {
        // Write  headers
        csvFilePrinter.printRecord(headers.keySet());
        for (CSVRecord record : parser.getRecords()) {
            // Checks if #records = #columns in header
            if (!record.isConsistent()) {
                List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1), "" + headers.size(), "" + record.size() });
                throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
            }
            for (i = 0; i < headers.size(); i++) {
                String varString = record.get(i);
                // A column stays "integer" only while every value is empty, the literal "null", or
                // a first character from firstNumCharSet followed by characters StringUtils.isNumeric accepts.
                isIntegerVariable[i] = isIntegerVariable[i] && varString != null && (varString.isEmpty() || varString.equals("null") || (firstNumCharSet.contains(varString.charAt(0)) && StringUtils.isNumeric(varString.substring(1))));
                if (isNumericVariable[i]) {
                    // If variable might be "numeric" test to see if this value is a parsable number:
                    if (varString != null && !varString.isEmpty()) {
                        boolean isNumeric = false;
                        boolean isInteger = false;
                        if (varString.equalsIgnoreCase("NaN") || varString.equalsIgnoreCase("NA") || varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf") || varString.equalsIgnoreCase("-Inf") || varString.equalsIgnoreCase("null")) {
                            // Special numeric token: the column is still numeric, move on to the next column.
                            continue;
                        } else {
                            try {
                                Double testDoubleValue = new Double(varString);
                                // Parsed as a double: the column is still numeric, move on to the next column.
                                continue;
                            } catch (NumberFormatException ex) {
                            // the token failed to parse as a double
                            // so the column is a string variable.
                            }
                        }
                        isNumericVariable[i] = false;
                    }
                }
                // If the column is not numeric, check whether it could be a time or date column
                // by parsing the cell as a date or date-time value:
                if (!isNumericVariable[i]) {
                    Date dateResult = null;
                    if (isTimeVariable[i]) {
                        if (varString != null && !varString.isEmpty()) {
                            boolean isTime = false;
                            // Once a time format has been selected for this column, only that format is tried.
                            if (selectedDateTimeFormat[i] != null) {
                                ParsePosition pos = new ParsePosition(0);
                                dateResult = selectedDateTimeFormat[i].parse(varString, pos);
                                if (dateResult != null && pos.getIndex() == varString.length()) {
                                    // OK, successfully parsed a value!
                                    isTime = true;
                                }
                            } else {
                                for (SimpleDateFormat format : TIME_FORMATS) {
                                    ParsePosition pos = new ParsePosition(0);
                                    dateResult = format.parse(varString, pos);
                                    if (dateResult != null && pos.getIndex() == varString.length()) {
                                        // OK, successfully parsed a value!
                                        isTime = true;
                                        selectedDateTimeFormat[i] = format;
                                        break;
                                    }
                                }
                            }
                            if (!isTime) {
                                isTimeVariable[i] = false;
                            // if the token didn't parse as a time value,
                            // we will still try to parse it as a date, below.
                            // unless this column is NOT a date.
                            } else {
                                // And if it is a time value, we are going to assume it's
                                // NOT a date.
                                isDateVariable[i] = false;
                            }
                        }
                    }
                    if (isDateVariable[i]) {
                        if (varString != null && !varString.isEmpty()) {
                            boolean isDate = false;
                            // -- L.A. 4.0 beta
                            for (SimpleDateFormat format : DATE_FORMATS) {
                                // Strict parsing - it will throw an
                                // exception if it doesn't parse!
                                format.setLenient(false);
                                try {
                                    format.parse(varString);
                                    isDate = true;
                                    selectedDateFormat[i] = format;
                                    break;
                                } catch (ParseException ex) {
                                // Do nothing
                                }
                            }
                            isDateVariable[i] = isDate;
                        }
                    }
                }
            }
            // Copy the record to the temp file so the second pass can re-read it.
            csvFilePrinter.printRecord(record);
        }
    }
    // Record count seen in the first pass; it is compared with the second pass's count at the end.
    dataTable.setCaseQuantity(parser.getRecordNumber());
    parser.close();
    csvReader.close();
    // Re-type the variables that we've determined are numerics:
    for (i = 0; i < headers.size(); i++) {
        if (isNumericVariable[i]) {
            dataTable.getDataVariables().get(i).setTypeNumeric();
            if (isIntegerVariable[i]) {
                dataTable.getDataVariables().get(i).setIntervalDiscrete();
            } else {
                dataTable.getDataVariables().get(i).setIntervalContinuous();
            }
        } else if (isDateVariable[i] && selectedDateFormat[i] != null) {
            // Dates are still Strings, i.e., they are "character" and "discrete";
            // But we add special format values for them:
            // NOTE(review): the stored pattern is always DATE_FORMATS[0], not selectedDateFormat[i] - confirm this is intended.
            dataTable.getDataVariables().get(i).setFormat(DATE_FORMATS[0].toPattern());
            dataTable.getDataVariables().get(i).setFormatCategory("date");
        } else if (isTimeVariable[i] && selectedDateTimeFormat[i] != null) {
            // Same for time values:
            dataTable.getDataVariables().get(i).setFormat(selectedDateTimeFormat[i].toPattern());
            dataTable.getDataVariables().get(i).setFormatCategory("time");
        }
    }
    // Second, final pass.
    try (BufferedReader secondPassReader = new BufferedReader(new FileReader(firstPassTempFile))) {
        parser = new CSVParser(secondPassReader, inFormat.withHeader());
        // Reused buffer holding the formatted cells of the current output row.
        String[] caseRow = new String[headers.size()];
        for (CSVRecord record : parser) {
            if (!record.isConsistent()) {
                List<String> args = Arrays.asList(new String[] { "" + (parser.getCurrentLineNumber() - 1), "" + headers.size(), "" + record.size() });
                throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.recordMismatch", args));
            }
            for (i = 0; i < headers.size(); i++) {
                String varString = record.get(i);
                if (isNumericVariable[i]) {
                    if (varString == null || varString.isEmpty() || varString.equalsIgnoreCase("NA")) {
                        // Missing value - represented as an empty string in
                        // the final tab file
                        caseRow[i] = "";
                    } else if (varString.equalsIgnoreCase("NaN")) {
                        // "Not a Number" special value:
                        caseRow[i] = "NaN";
                    } else if (varString.equalsIgnoreCase("Inf") || varString.equalsIgnoreCase("+Inf")) {
                        // Positive infinity:
                        caseRow[i] = "Inf";
                    } else if (varString.equalsIgnoreCase("-Inf")) {
                        // Negative infinity:
                        caseRow[i] = "-Inf";
                    } else if (varString.equalsIgnoreCase("null")) {
                        // By request from Gus - "NULL" is recognized as a
                        // numeric zero:
                        caseRow[i] = isIntegerVariable[i] ? "0" : "0.0";
                    } else {
                        /* No re-formatting is done on any other numeric values.
                             * We'll save them as they were, for archival purposes.
                             * The alternative solution - formatting in sci. notation
                             * is commented-out below.
                             */
                        caseRow[i] = varString;
                    /*
                             if (isIntegerVariable[i]) {
                                try {
                                    Integer testIntegerValue = new Integer(varString);
                                    caseRow[i] = testIntegerValue.toString();
                                } catch (NumberFormatException ex) {
                                    throw new IOException("Failed to parse a value recognized as an integer in the first pass! (?)");
                                }
                            } else {
                                try {
                                    Double testDoubleValue = new Double(varString);
                                    if (testDoubleValue.equals(0.0)) {
                                        caseRow[i] = "0.0";
                                    } else {
                                                                            // One possible implementation:
                                        //
                                        // Round our fractional values to 15 digits
                                        // (minimum number of digits of precision guaranteed by
                                        // type Double) and format the resulting representations
                                        // in a IEEE 754-like "scientific notation" - for ex.,
                                        // 753.24 will be encoded as 7.5324e2
                                        BigDecimal testBigDecimal = new BigDecimal(varString, doubleMathContext);
                                        caseRow[i] = String.format(FORMAT_IEEE754, testBigDecimal);

                                        // Strip meaningless zeros and extra + signs:
                                        caseRow[i] = caseRow[i].replaceFirst("00*e", "e");
                                        caseRow[i] = caseRow[i].replaceFirst("\\.e", ".0e");
                                        caseRow[i] = caseRow[i].replaceFirst("e\\+00", "");
                                        caseRow[i] = caseRow[i].replaceFirst("^\\+", "");
                                    }
                                } catch (NumberFormatException ex) {
                                    throw new IOException("Failed to parse a value recognized as numeric in the first pass! (?)");
                                }
                            }
                             */
                    }
                } else if (isTimeVariable[i] || isDateVariable[i]) {
                    // Time and Dates are stored NOT quoted (don't ask).
                    if (varString != null) {
                        // Dealing with quotes:
                        // remove the leading and trailing quotes, if present:
                        varString = varString.replaceFirst("^\"*", "");
                        varString = varString.replaceFirst("\"*$", "");
                        caseRow[i] = varString;
                    } else {
                        caseRow[i] = "";
                    }
                } else {
                    // Plain string column: values are escaped and wrapped in double quotes;
                    // a null cell is written as an empty quoted string.
                    if (varString != null) {
                        // escape the quotes, newlines, and tabs:
                        varString = varString.replace("\"", "\\\"");
                        varString = varString.replace("\n", "\\n");
                        varString = varString.replace("\t", "\\t");
                        // final pair of quotes:
                        varString = "\"" + varString + "\"";
                        caseRow[i] = varString;
                    } else {
                        caseRow[i] = "\"\"";
                    }
                }
            }
            finalOut.println(StringUtils.join(caseRow, "\t"));
        }
    }
    // Number of records the second pass produced.
    long linecount = parser.getRecordNumber();
    finalOut.close();
    parser.close();
    dbglog.fine("Tmp File: " + firstPassTempFile);
    // Firstpass file is deleted to prevent tmp from filling up.
    // NOTE(review): this cleanup is skipped when an exception is thrown earlier in the method.
    firstPassTempFile.delete();
    // Sanity check: both passes must have seen the same number of records.
    if (dataTable.getCaseQuantity().intValue() != linecount) {
        List<String> args = Arrays.asList(new String[] { "" + dataTable.getCaseQuantity().intValue(), "" + linecount });
        throw new IOException(BundleUtil.getStringFromBundle("ingest.csv.line_mismatch", args));
    }
    return (int) linecount;
}
Also used : FileWriter(java.io.FileWriter) ArrayList(java.util.ArrayList) DataVariable(edu.harvard.iq.dataverse.datavariable.DataVariable) CSVPrinter(org.apache.commons.csv.CSVPrinter) TabularDataFileReader(edu.harvard.iq.dataverse.ingest.tabulardata.TabularDataFileReader) FileReader(java.io.FileReader) ParsePosition(java.text.ParsePosition) IOException(java.io.IOException) Date(java.util.Date) CSVParser(org.apache.commons.csv.CSVParser) BufferedReader(java.io.BufferedReader) CSVRecord(org.apache.commons.csv.CSVRecord) ParseException(java.text.ParseException) SimpleDateFormat(java.text.SimpleDateFormat) File(java.io.File)

Example 79 with CSVRecord

use of org.apache.commons.csv.CSVRecord in project jaqy by Teradata.

the class CSVUtils method getSchemaInfo.

/**
 * Infers a column schema by scanning CSV records.
 *
 * <p>The first record fixes the number of columns. Every cell that is not one of
 * {@code naValues} updates its column's minimum/maximum length and narrows its type:
 * a column stays numeric while every value parses as a {@link BigDecimal}, becomes CHAR when a
 * non-number appears and all lengths seen so far are equal, and becomes VARCHAR otherwise.
 *
 * @param headers  optional column names; a missing or blank name is replaced by {@code colN}
 * @param iterator source of records to scan
 * @param naValues optional strings that mark a cell as null; such cells make the column nullable
 * @param precise  when true, numeric columns with a single positive scale are reported as DECIMAL
 * @param limit    maximum number of rows to scan; zero means no cap, a negative value means no
 *                 cap but allows stopping early once every column has
 *                 {@code AUTO_STOP_MINIMUM} counted values and no VARCHAR column was found
 * @return the inferred schema, or {@code null} when no record was read
 */
public static SchemaInfo getSchemaInfo(String[] headers, Iterator<CSVRecord> iterator, String[] naValues, boolean precise, long limit) {
    // A negative limit switches on auto-stop; negative and zero limits both remove the row cap.
    boolean autoStop = limit < 0;
    if (limit <= 0) {
        limit = Long.MAX_VALUE;
    }
    int count = -1;
    int rowCount = 0;
    ScanColumnType[] columns = null;
    while (iterator.hasNext() && rowCount < limit) {
        CSVRecord record = iterator.next();
        ++rowCount;
        // The first record sizes the column array and always requests a further row.
        boolean needScan = count == -1;
        if (needScan) {
            count = record.size();
            columns = new ScanColumnType[count];
            for (int c = 0; c < count; ++c) {
                ScanColumnType fresh = new ScanColumnType();
                fresh.type = Types.NULL;
                fresh.nullable = false;
                fresh.minLength = Integer.MAX_VALUE;
                fresh.maxLength = -1;
                columns[c] = fresh;
            }
        }
        for (int c = 0; c < count; ++c) {
            ScanColumnType col = columns[c];
            String value = record.get(c);
            boolean isNa = false;
            if (naValues != null) {
                for (int k = 0; k < naValues.length && !isNa; ++k) {
                    isNa = value.equals(naValues[k]);
                }
            }
            if (isNa) {
                col.nullable = true;
            } else {
                int len = value.length();
                if (len > col.maxLength) {
                    col.maxLength = len;
                }
                if (len < col.minLength) {
                    col.minLength = len;
                }
                if (col.type == Types.NULL || col.type == Types.NUMERIC) {
                    try {
                        BigDecimal dec = new BigDecimal(value);
                        int rawPrecision = dec.precision();
                        int scale = dec.scale();
                        // A precision not larger than the scale means the value has a leading "0."
                        int precision = rawPrecision <= scale ? scale + 1 : rawPrecision;
                        if (col.type == Types.NULL) {
                            col.type = Types.NUMERIC;
                            col.precision = precision;
                            col.scale = scale;
                        } else {
                            // Integer.MAX_VALUE flags a column whose values do not share one scale.
                            if (col.scale != scale) {
                                col.scale = Integer.MAX_VALUE;
                            }
                            if (precision > col.precision) {
                                col.precision = precision;
                            }
                        }
                        ++col.notNullCount;
                    } catch (Exception ex) {
                        // Not a number: equal lengths so far suggest a fixed-width CHAR column.
                        if (col.minLength == col.maxLength) {
                            col.type = Types.CHAR;
                            ++col.notNullCount;
                        } else {
                            col.type = Types.VARCHAR;
                            // VARCHAR needs every row scanned to find the maximum length.
                            autoStop = false;
                        }
                    }
                } else if (col.type == Types.CHAR) {
                    if (col.minLength != col.maxLength) {
                        col.type = Types.VARCHAR;
                        // VARCHAR needs every row scanned to find the maximum length.
                        autoStop = false;
                    } else {
                        ++col.notNullCount;
                    }
                }
            }
            // Keep scanning until this column has enough counted values to be trusted.
            if (autoStop && col.notNullCount < AUTO_STOP_MINIMUM) {
                needScan = true;
            }
        }
        if (autoStop && !needScan) {
            // Every column has enough samples; stop early.
            break;
        }
    }
    if (rowCount == 0) {
        return null;
    }
    FullColumnInfo[] columnInfos = new FullColumnInfo[count];
    for (int c = 0; c < count; ++c) {
        ScanColumnType scanned = columns[c];
        FullColumnInfo info = new FullColumnInfo();
        columnInfos[c] = info;
        if (headers != null) {
            info.name = headers[c];
        }
        if (info.name == null || info.name.trim().length() == 0) {
            info.name = "col" + (c + 1);
        }
        info.label = info.name;
        info.nullable = scanned.nullable ? ResultSetMetaData.columnNullable : ResultSetMetaData.columnNoNulls;
        boolean textual = scanned.type == Types.CHAR || scanned.type == Types.VARCHAR;
        if (textual) {
            info.type = scanned.type;
            info.precision = scanned.maxLength;
            continue;
        }
        info.precision = scanned.precision;
        boolean mixedScale = scanned.scale == Integer.MAX_VALUE;
        if (!mixedScale && scanned.scale <= 0 && scanned.precision < 11) {
            info.type = Types.INTEGER;
            info.scale = 0;
        } else if (!mixedScale && precise && scanned.scale > 0) {
            info.type = Types.DECIMAL;
            info.scale = scanned.scale;
        } else {
            info.type = Types.DOUBLE;
            info.scale = 0;
        }
    }
    return new SchemaInfo(columnInfos);
}
Also used : FullColumnInfo(com.teradata.jaqy.schema.FullColumnInfo) CSVRecord(org.apache.commons.csv.CSVRecord) BigDecimal(java.math.BigDecimal) SchemaInfo(com.teradata.jaqy.schema.SchemaInfo)

Example 80 with CSVRecord

use of org.apache.commons.csv.CSVRecord in project lumberjack by fn-ctional.

the class WebBackend method multipartFileToRecords.

/**
 * Parses an uploaded CSV file into its records.
 *
 * <p>The upload is copied to a temporary file rather than to a path built from the
 * client-supplied file name. All records are read eagerly, so the reader can be closed and the
 * temporary file deleted before this method returns.
 *
 * @param csv uploaded CSV; its first record is treated as the header
 * @return the parsed records, fully loaded into memory
 * @throws FileUploadException if the upload cannot be stored or read
 */
private Iterable<CSVRecord> multipartFileToRecords(MultipartFile csv) throws FileUploadException {
    File file = null;
    try {
        // Do not derive the path from getOriginalFilename(): it is client-controlled and may be null.
        file = File.createTempFile("upload-", ".csv");
        csv.transferTo(file);
        try (Reader in = new FileReader(file)) {
            // getRecords() materialises every record, so nothing reads from the file after it is closed.
            Iterable<CSVRecord> records = CSVFormat.DEFAULT.withHeader(HEADERS).withFirstRecordAsHeader().parse(in).getRecords();
            return records;
        }
    } catch (IOException e) {
        throw new FileUploadException();
    } finally {
        // Best-effort cleanup; the records no longer depend on the file.
        if (file != null) {
            file.delete();
        }
    }
}
Also used : Reader(java.io.Reader) FileReader(java.io.FileReader) FileReader(java.io.FileReader) CSVRecord(org.apache.commons.csv.CSVRecord) IOException(java.io.IOException) File(java.io.File) MultipartFile(org.springframework.web.multipart.MultipartFile) FileUploadException(uk.ac.bris.cs.rfideasalreadytaken.lumberjack.exceptions.FileUploadException)

Aggregations

CSVRecord (org.apache.commons.csv.CSVRecord)127 CSVParser (org.apache.commons.csv.CSVParser)71 IOException (java.io.IOException)40 CSVFormat (org.apache.commons.csv.CSVFormat)40 ArrayList (java.util.ArrayList)36 Reader (java.io.Reader)24 StringReader (java.io.StringReader)22 InputStreamReader (java.io.InputStreamReader)18 FileReader (java.io.FileReader)16 Test (org.junit.Test)14 Path (java.nio.file.Path)13 HashMap (java.util.HashMap)11 File (java.io.File)10 PreparedStatement (java.sql.PreparedStatement)10 InputStream (java.io.InputStream)9 ResultSet (java.sql.ResultSet)9 PhoenixConnection (org.apache.phoenix.jdbc.PhoenixConnection)9 CSVCommonsLoader (org.apache.phoenix.util.CSVCommonsLoader)9 BufferedReader (java.io.BufferedReader)8 Map (java.util.Map)7