Search in sources :

Example 36 with CSVFormat

use of org.apache.commons.csv.CSVFormat in project pinot by linkedin.

the class CSVRecordReader method getFormat.

/**
 * Builds the CSV format from the reader configuration.
 *
 * <p>Starts from the configured base format and delimiter, then applies the
 * configured header names when present. When no header is configured,
 * {@code withHeader()} is called with no names (per the Commons CSV docs this
 * makes the parser take the header from the first record).
 *
 * @return the configured format
 */
private CSVFormat getFormat() {
    final CSVFormat base = getFormatFromConfig().withDelimiter(getDelimiterFromConfig());
    final String[] configuredHeader = getHeaderFromConfig();
    return configuredHeader == null
        ? base.withHeader()
        : base.withHeader(configuredHeader);
}
Also used : CSVFormat(org.apache.commons.csv.CSVFormat)

Example 37 with CSVFormat

use of org.apache.commons.csv.CSVFormat in project camel by apache.

the class CsvRecordConvertersTest method setUp.

/**
 * Parses the single-line sample {@code "1,2,3"} with header names A, B, C and
 * stores the first resulting record in the {@code record} fixture field.
 */
@Before
public void setUp() throws Exception {
    CSVFormat format = CSVFormat.DEFAULT.withHeader("A", "B", "C");
    // CSVParser is Closeable; getRecords() has read everything by the time we leave the block,
    // so closing the parser here does not affect the extracted record.
    try (CSVParser parser = new CSVParser(new StringReader("1,2,3"), format)) {
        List<CSVRecord> records = parser.getRecords();
        record = records.get(0);
    }
}
Also used : CSVParser(org.apache.commons.csv.CSVParser) StringReader(java.io.StringReader) CSVFormat(org.apache.commons.csv.CSVFormat) CSVRecord(org.apache.commons.csv.CSVRecord) Before(org.junit.Before)

Example 38 with CSVFormat

use of org.apache.commons.csv.CSVFormat in project kylo by Teradata.

the class CSVAutoDetect method guessDelimiter.

/**
 * Guesses the delimiter of a CSV sample.
 *
 * <p>Every delimiter counted on the first line is tried in the order produced by
 * {@code sortDelimitersIntoPreferredOrder}. A delimiter is returned immediately when the
 * sample parses with a header map and every record has as many columns as the header.
 * Otherwise the delimiter becomes a candidate if its count on the first line equals its
 * count on every subsequent line. When nothing is returned early, the first candidate
 * (i.e. the most preferred one) wins.
 *
 * @param lineStats per-line statistics of the sample
 * @param value     the raw sample text that is test-parsed
 * @param quote     quote character passed to the trial formats
 * @param headerRow whether the first record should be treated as the header
 * @return the guessed delimiter, or {@code null} when none could be determined
 * @throws IOException if test-parsing the sample fails
 */
private Character guessDelimiter(List<LineStats> lineStats, String value, Character quote, boolean headerRow) throws IOException {
    // Assume delimiter exists in first line and compare to subsequent lines
    if (lineStats.isEmpty()) {
        return null;
    }
    LineStats firstLineStat = lineStats.get(0);
    Map<Character, Integer> firstLineDelimCounts = firstLineStat.calcDelimCountsOrdered();
    if (firstLineDelimCounts == null || firstLineDelimCounts.isEmpty()) {
        return null;
    }

    List<Character> candidates = new ArrayList<>();
    Set<Character> firstLineDelimKeys = sortDelimitersIntoPreferredOrder(firstLineDelimCounts.keySet());
    for (Character delim : firstLineDelimKeys) {
        // Attempt to parse the sample with this delimiter
        CSVFormat format;
        if (headerRow) {
            format = CSVFormat.DEFAULT.withFirstRecordAsHeader().withDelimiter(delim).withQuote(quote);
        } else {
            format = CSVFormat.DEFAULT.withDelimiter(delim).withQuote(quote);
        }
        try (StringReader sr = new StringReader(value);
             CSVParser parser = format.parse(sr)) {
            Map<String, Integer> headerMap = parser.getHeaderMap();
            if (headerMap != null) {
                int size = headerMap.size();
                List<CSVRecord> records = parser.getRecords();
                if (records.stream().allMatch(record -> record.size() == size)) {
                    return delim;
                }
            }
        }

        // Fall back to comparing the per-line delimiter counts with the first line's count
        Integer delimCount = firstLineDelimCounts.get(delim);
        boolean match = true;
        for (int i = 1; i < lineStats.size() && match; i++) {
            LineStats thisLine = lineStats.get(i);
            Integer rowDelimCount = thisLine.delimStats.get(delim);
            match = delimCount.equals(rowDelimCount);
        }
        if (match) {
            candidates.add(delim);
        }
    }

    // Candidates were collected while iterating in preferred order, so the first one is the
    // most preferred consistent delimiter. (The previous tie-break returned the first key of
    // firstLineDelimKeys unconditionally, even when that key had been rejected.)
    return candidates.isEmpty() ? null : candidates.get(0);
}
Also used : ArrayList(java.util.ArrayList) CSVParser(org.apache.commons.csv.CSVParser) StringReader(java.io.StringReader) CSVFormat(org.apache.commons.csv.CSVFormat) CSVRecord(org.apache.commons.csv.CSVRecord)

Example 39 with CSVFormat

use of org.apache.commons.csv.CSVFormat in project kylo by Teradata.

the class CSVAutoDetect method detectCSVFormat.

/**
 * Derives a {@link CSVFormat} from a text sample by guessing its quote and delimiter characters.
 *
 * @param sampleText   the sample text to analyse
 * @param headerRow    whether the first row of the sample is a header; forwarded to the delimiter guess
 * @param seperatorStr optional separator hint; when not blank, its first character is passed to
 *                     the statistics generation
 * @return a format allowing missing column names, configured with the guessed delimiter and
 *         quote character and {@code QuoteMode.MINIMAL}
 * @throws IOException if no delimiter can be determined ("Unrecognized format") or reading the
 *                     sample fails
 */
public CSVFormat detectCSVFormat(String sampleText, boolean headerRow, String seperatorStr) throws IOException {
    final CSVFormat baseFormat = CSVFormat.DEFAULT.withAllowMissingColumnNames();
    final Character separatorHint = StringUtils.isNotBlank(seperatorStr)
        ? Character.valueOf(seperatorStr.charAt(0))
        : null;

    try (BufferedReader sampleReader = new BufferedReader(new StringReader(sampleText))) {
        final List<LineStats> stats = generateStats(sampleReader, separatorHint);
        final Character quoteChar = guessQuote(stats);
        final Character delimiter = guessDelimiter(stats, sampleText, quoteChar, headerRow);
        if (delimiter == null) {
            throw new IOException("Unrecognized format");
        }
        return baseFormat
            .withDelimiter(delimiter)
            .withQuoteMode(QuoteMode.MINIMAL)
            .withQuote(quoteChar);
    }
}
Also used : BufferedReader(java.io.BufferedReader) StringReader(java.io.StringReader) CSVFormat(org.apache.commons.csv.CSVFormat) IOException(java.io.IOException)

Example 40 with CSVFormat

use of org.apache.commons.csv.CSVFormat in project kylo by Teradata.

the class CSVFileSchemaParser method parse.

/**
 * Parses a sample of the given stream as CSV and converts the discovered file schema to the
 * requested target schema type.
 *
 * @param is      the stream to sample; must not be null
 * @param charset the charset used to read the stream; must not be null
 * @param target  the schema type to convert to; must not be null
 * @return the schema converted to the target type
 * @throws IOException if reading or parsing the sample fails
 */
@Override
public Schema parse(InputStream is, Charset charset, TableSchemaType target) throws IOException {
    Validate.notNull(target, "target must not be null");
    Validate.notNull(is, "stream must not be null");
    Validate.notNull(charset, "charset must not be null");
    validate();
    // Parse the file
    String sampleData = ParserHelper.extractSampleLines(is, charset, numRowsToSample);
    Validate.notEmpty(sampleData, "No data in file");
    CSVFormat format = createCSVFormat(sampleData);
    // The parser is Closeable as well as the reader, so manage both in the resource header.
    try (Reader reader = new StringReader(sampleData);
         CSVParser parser = format.parse(reader)) {
        DefaultFileSchema fileSchema = populateSchema(parser);
        fileSchema.setCharset(charset.name());
        // Convert to target schema with proper derived types
        return convertToTarget(target, fileSchema);
    }
}
Also used : CSVParser(org.apache.commons.csv.CSVParser) DefaultFileSchema(com.thinkbiganalytics.discovery.model.DefaultFileSchema) Schema(com.thinkbiganalytics.discovery.schema.Schema) DefaultHiveSchema(com.thinkbiganalytics.discovery.model.DefaultHiveSchema) DefaultTableSchema(com.thinkbiganalytics.discovery.model.DefaultTableSchema) StringReader(java.io.StringReader) DefaultFileSchema(com.thinkbiganalytics.discovery.model.DefaultFileSchema) Reader(java.io.Reader) StringReader(java.io.StringReader) CSVFormat(org.apache.commons.csv.CSVFormat)

Aggregations

CSVFormat (org.apache.commons.csv.CSVFormat)59 IOException (java.io.IOException)23 CSVRecord (org.apache.commons.csv.CSVRecord)22 CSVParser (org.apache.commons.csv.CSVParser)19 ArrayList (java.util.ArrayList)14 StringReader (java.io.StringReader)13 CSVPrinter (org.apache.commons.csv.CSVPrinter)10 InputStream (java.io.InputStream)9 InputStreamReader (java.io.InputStreamReader)8 HashMap (java.util.HashMap)8 SimpleRecordSchema (org.apache.nifi.serialization.SimpleRecordSchema)8 RecordField (org.apache.nifi.serialization.record.RecordField)8 RecordSchema (org.apache.nifi.serialization.record.RecordSchema)8 Test (org.junit.Test)8 ByteArrayOutputStream (java.io.ByteArrayOutputStream)7 Reader (java.io.Reader)7 LinkedHashMap (java.util.LinkedHashMap)7 SchemaNameAsAttribute (org.apache.nifi.schema.access.SchemaNameAsAttribute)7 MapRecord (org.apache.nifi.serialization.record.MapRecord)7 Record (org.apache.nifi.serialization.record.Record)7