Search in sources :

Example 71 with CSVRecord

use of org.apache.commons.csv.CSVRecord in project tika by apache.

the class ISATabUtils method parseStudy.

/**
 * Reads the given stream as tab-delimited text ({@code CSVFormat.TDF}) and writes it to the
 * handler as an XHTML table: the first record becomes {@code th} cells inside {@code thead},
 * and every following record becomes a {@code tr} of {@code td} cells inside {@code tbody}.
 *
 * @param stream   input to parse
 * @param xhtml    handler that receives the table elements
 * @param metadata metadata handed to the encoding-detecting reader
 * @param context  parse context; may supply a {@code TikaConfig}
 * @throws IOException   if reading the stream fails
 * @throws TikaException if the reader cannot be created
 * @throws SAXException  if the handler rejects an event
 */
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tis = TikaInputStream.get(stream);
    // Use the configuration from the context, falling back to the default one
    TikaConfig contextConfig = context.get(TikaConfig.class);
    TikaConfig tikaConfig = (contextConfig != null) ? contextConfig : TikaConfig.getDefaultConfig();
    // The reader detects the character encoding with the configured detector
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tis), metadata, tikaConfig.getEncodingDetector());
        CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
        Iterator<CSVRecord> rows = csvParser.iterator();
        xhtml.startElement("table");

        // Header: only the first record, if there is one
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String heading : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(heading);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: one table row per remaining record
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            xhtml.startElement("tr");
            for (String cell : rows.next()) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) CSVParser(org.apache.commons.csv.CSVParser) TikaInputStream(org.apache.tika.io.TikaInputStream) CSVRecord(org.apache.commons.csv.CSVRecord) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 72 with CSVRecord

use of org.apache.commons.csv.CSVRecord in project tika by apache.

the class ISATabUtils method extractMetadata.

/**
 * Walks a tab-delimited ({@code CSVFormat.TDF}) reader record by record and copies selected
 * fields into {@code metadata}.
 *
 * <p>A record with a single, all-upper-case column is treated as a section header. Records
 * that follow a header listed in {@code sections} are passed to {@code addMetadata}. When
 * {@code studyFileName} is non-null, records that follow the {@code studySectionField} header
 * are collected into a map until the record whose field is {@code studyFileNameField} and
 * whose value equals {@code studyFileName} is seen; the map is then handed to
 * {@code mapStudyToMetadata}, and later non-header records are passed to {@code addMetadata}
 * until the next study section starts, at which point parsing stops.
 *
 * @param reader        source of the tab-delimited text; closed together with the parser
 * @param metadata      receives the extracted values
 * @param studyFileName study file name to look for, or {@code null} to skip study sections
 * @throws IOException if reading or closing the parser fails
 */
private static void extractMetadata(Reader reader, Metadata metadata, String studyFileName) throws IOException {
    boolean investigationSection = false;
    boolean studySection = false;
    boolean studyTarget = false;
    Map<String, String> map = new HashMap<String, String>();
    try (CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
        for (CSVRecord record : csvParser) {
            String field = record.get(0);
            if ((field.toUpperCase(Locale.ENGLISH).equals(field)) && (record.size() == 1)) {
                // Section header: decide which kind of section the next records belong to
                investigationSection = Arrays.asList(sections).contains(field);
                studySection = (studyFileName != null) && (field.equals(studySectionField));
            } else {
                if (investigationSection) {
                    addMetadata(field, record, metadata);
                } else if (studySection) {
                    if (studyTarget) {
                        // The wanted study was already handled and a new study section began
                        break;
                    }
                    // A field name without a value column would make get(1) throw an
                    // unchecked ArrayIndexOutOfBoundsException; treat it as an empty value
                    String value = (record.size() > 1) ? record.get(1) : "";
                    map.put(field, value);
                    studyTarget = (field.equals(studyFileNameField)) && (value.equals(studyFileName));
                    if (studyTarget) {
                        mapStudyToMetadata(map, metadata);
                        studySection = false;
                    }
                } else if (studyTarget) {
                    addMetadata(field, record, metadata);
                }
            }
        }
    }
}
Also used : HashMap(java.util.HashMap) CSVParser(org.apache.commons.csv.CSVParser) CSVRecord(org.apache.commons.csv.CSVRecord) IOException(java.io.IOException)

Example 73 with CSVRecord

use of org.apache.commons.csv.CSVRecord in project tika by apache.

the class ISATabUtils method parseAssay.

/**
 * Reads the given stream as tab-delimited text ({@code CSVFormat.TDF}) and writes it to the
 * handler as an XHTML table: the first record becomes {@code th} cells inside {@code thead},
 * and every following record becomes a {@code tr} of {@code td} cells inside {@code tbody}.
 *
 * @param stream   input to parse
 * @param xhtml    handler that receives the table elements
 * @param metadata metadata handed to the encoding-detecting reader
 * @param context  parse context; may supply a {@code TikaConfig}
 * @throws IOException   if reading the stream fails
 * @throws TikaException if the reader cannot be created
 * @throws SAXException  if the handler rejects an event
 */
public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tis = TikaInputStream.get(stream);
    // Use the configuration from the context, falling back to the default one
    TikaConfig contextConfig = context.get(TikaConfig.class);
    TikaConfig tikaConfig = (contextConfig != null) ? contextConfig : TikaConfig.getDefaultConfig();
    // The reader detects the character encoding with the configured detector
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tis), metadata, tikaConfig.getEncodingDetector());
        CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
        xhtml.startElement("table");
        Iterator<CSVRecord> rows = csvParser.iterator();

        // Header: only the first record, if there is one
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String heading : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(heading);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");

        // Body: one table row per remaining record
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            xhtml.startElement("tr");
            for (String cell : rows.next()) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");

        xhtml.endElement("table");
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) CSVParser(org.apache.commons.csv.CSVParser) TikaInputStream(org.apache.tika.io.TikaInputStream) CSVRecord(org.apache.commons.csv.CSVRecord) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 74 with CSVRecord

use of org.apache.commons.csv.CSVRecord in project kylo by Teradata.

the class CSVAutoDetect method guessDelimiter.

/**
 * Guesses the delimiter used by the sampled text.
 *
 * <p>Each delimiter found on the first line is tried in the order produced by
 * {@code sortDelimitersIntoPreferredOrder}. A delimiter is returned immediately when the
 * parser exposes a header map and every parsed record has as many columns as that header.
 * Otherwise the delimiter becomes a candidate when every later line contains it exactly as
 * often as the first line does. The first candidate, in that same order, is returned.
 *
 * @param lineStats per-line delimiter statistics; the first entry is the reference line
 * @param value     the text handed to the CSV parser
 * @param quote     quote character applied to the CSV format
 * @param headerRow whether the first record is treated as a header
 * @return the guessed delimiter, or {@code null} when none qualifies
 * @throws IOException if parsing the sample fails
 */
private Character guessDelimiter(List<LineStats> lineStats, String value, Character quote, boolean headerRow) throws IOException {
    // Assume delimiter exists in first line and compare to subsequent lines
    if (lineStats.isEmpty()) {
        return null;
    }
    LineStats firstLineStat = lineStats.get(0);
    Map<Character, Integer> firstLineDelimCounts = firstLineStat.calcDelimCountsOrdered();
    if (firstLineDelimCounts == null || firstLineDelimCounts.isEmpty()) {
        return null;
    }
    List<Character> candidates = new ArrayList<>();
    // Attempt to parse given delimiter
    Set<Character> firstLineDelimKeys = sortDelimitersIntoPreferredOrder(firstLineDelimCounts.keySet());
    for (Character delim : firstLineDelimKeys) {
        CSVFormat format;
        if (headerRow) {
            format = CSVFormat.DEFAULT.withFirstRecordAsHeader().withDelimiter(delim).withQuote(quote);
        } else {
            format = CSVFormat.DEFAULT.withDelimiter(delim).withQuote(quote);
        }
        try (StringReader sr = new StringReader(value);
            CSVParser parser = format.parse(sr)) {
            if (parser.getHeaderMap() != null) {
                int size = parser.getHeaderMap().size();
                List<CSVRecord> records = parser.getRecords();
                // Every record having the header's width is taken as proof of the delimiter
                boolean match = records.stream().allMatch(record -> record.size() == size);
                if (match) {
                    return delim;
                }
            }
        }
        // Fall back to comparing raw delimiter counts line by line
        Integer delimCount = firstLineDelimCounts.get(delim);
        boolean match = true;
        for (int i = 1; i < lineStats.size() && match; i++) {
            LineStats thisLine = lineStats.get(i);
            Integer rowDelimCount = thisLine.delimStats.get(delim);
            match = delimCount.equals(rowDelimCount);
        }
        if (match) {
            candidates.add(delim);
        }
    }
    // Candidates were collected in preferred order, so the first one is the best match.
    // The previous loop returned the first first-line delimiter even when it was not a
    // candidate at all.
    return candidates.isEmpty() ? null : candidates.get(0);
}
Also used : ArrayList(java.util.ArrayList) CSVParser(org.apache.commons.csv.CSVParser) StringReader(java.io.StringReader) CSVFormat(org.apache.commons.csv.CSVFormat) CSVRecord(org.apache.commons.csv.CSVRecord)

Example 75 with CSVRecord

use of org.apache.commons.csv.CSVRecord in project nifi by apache.

the class SimpleCsvFileLookupService method loadCache.

/**
 * Reloads the lookup cache from {@code csvFile}.
 *
 * <p>The work is done only when {@code lock} can be acquired without waiting; otherwise the
 * method returns without touching the cache. The file is parsed with the first record as
 * header, and the {@code lookupKeyColumn} / {@code lookupValueColumn} values of each record
 * are collected into a map that replaces {@code cache} once the whole file has been read.
 *
 * @throws IllegalStateException if a key is blank, or if a key occurs twice while
 *                               {@code ignoreDuplicates} is false
 * @throws IOException           if the file cannot be opened, read or closed
 */
private void loadCache() throws IllegalStateException, IOException {
    if (lock.tryLock()) {
        try {
            final ComponentLog logger = getLogger();
            if (logger.isDebugEnabled()) {
                logger.debug("Loading lookup table from file: " + csvFile);
            }
            final Map<String, String> properties = new HashMap<>();
            // try-with-resources releases the file handle on success and on every
            // exception path; the reader was previously never closed
            try (final FileReader reader = new FileReader(csvFile)) {
                final Iterable<CSVRecord> records = csvFormat.withFirstRecordAsHeader().parse(reader);
                for (final CSVRecord record : records) {
                    final String key = record.get(lookupKeyColumn);
                    final String value = record.get(lookupValueColumn);
                    if (StringUtils.isBlank(key)) {
                        throw new IllegalStateException("Empty lookup key encountered in: " + csvFile);
                    } else if (!ignoreDuplicates && properties.containsKey(key)) {
                        throw new IllegalStateException("Duplicate lookup key encountered: " + key + " in " + csvFile);
                    } else if (ignoreDuplicates && properties.containsKey(key)) {
                        logger.warn("Duplicate lookup key encountered: {} in {}", new Object[] { key, csvFile });
                    }
                    // With ignoreDuplicates set, a later record overwrites an earlier one
                    properties.put(key, value);
                }
            }
            this.cache = new ConcurrentHashMap<>(properties);
            if (cache.isEmpty()) {
                logger.warn("Lookup table is empty after reading file: " + csvFile);
            }
        } finally {
            lock.unlock();
        }
    }
}
Also used : HashMap(java.util.HashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) FileReader(java.io.FileReader) CSVRecord(org.apache.commons.csv.CSVRecord) ComponentLog(org.apache.nifi.logging.ComponentLog)

Aggregations

CSVRecord (org.apache.commons.csv.CSVRecord)127 CSVParser (org.apache.commons.csv.CSVParser)71 IOException (java.io.IOException)40 CSVFormat (org.apache.commons.csv.CSVFormat)40 ArrayList (java.util.ArrayList)36 Reader (java.io.Reader)24 StringReader (java.io.StringReader)22 InputStreamReader (java.io.InputStreamReader)18 FileReader (java.io.FileReader)16 Test (org.junit.Test)14 Path (java.nio.file.Path)13 HashMap (java.util.HashMap)11 File (java.io.File)10 PreparedStatement (java.sql.PreparedStatement)10 InputStream (java.io.InputStream)9 ResultSet (java.sql.ResultSet)9 PhoenixConnection (org.apache.phoenix.jdbc.PhoenixConnection)9 CSVCommonsLoader (org.apache.phoenix.util.CSVCommonsLoader)9 BufferedReader (java.io.BufferedReader)8 Map (java.util.Map)7