Example use of org.apache.commons.csv.CSVRecord in the Apache Tika project: class ISATabUtils, method parseStudy.
/**
 * Renders an ISA-Tab study file (tab-separated) as an XHTML table.
 * The first record is emitted as the table header; every remaining
 * record becomes a body row.
 *
 * @param stream   raw study file content; wrapped so the caller's stream is not closed
 * @param xhtml    handler receiving the generated table elements
 * @param metadata metadata object passed to the charset detector
 * @param context  parse context, consulted for a TikaConfig
 * @throws IOException   on read failure
 * @throws TikaException on charset-detection failure
 * @throws SAXException  on content-handler failure
 */
public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tikaStream = TikaInputStream.get(stream);
    // Resolve the config whose encoding detector drives charset auto-detection;
    // fall back to the default config when the context supplies none.
    TikaConfig config = context.get(TikaConfig.class);
    if (config == null) {
        config = TikaConfig.getDefaultConfig();
    }
    try (AutoDetectReader charsetReader = new AutoDetectReader(new CloseShieldInputStream(tikaStream), metadata, config.getEncodingDetector());
            CSVParser parser = new CSVParser(charsetReader, CSVFormat.TDF)) {
        Iterator<CSVRecord> rows = parser.iterator();
        xhtml.startElement("table");
        // First record, if any, becomes the header row.
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String cell : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(cell);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");
        // Every subsequent record becomes one body row.
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            CSVRecord row = rows.next();
            xhtml.startElement("tr");
            for (String cell : row) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
    }
}
Example use of org.apache.commons.csv.CSVRecord in the Apache Tika project: class ISATabUtils, method extractMetadata.
/**
 * Scans an ISA-Tab investigation file and copies metadata for the study
 * whose "Study File Name" matches {@code studyFileName} into {@code metadata}.
 *
 * Fixes over the previous version: removed a redundant
 * {@code catch (IOException ioe) { throw ioe; }} wrapper that only rethrew,
 * and used the diamond operator for the map construction.
 *
 * @param reader        investigation file content (tab-separated)
 * @param metadata      target for extracted metadata
 * @param studyFileName study file to match, or null to skip study matching
 * @throws IOException on read failure
 */
private static void extractMetadata(Reader reader, Metadata metadata, String studyFileName) throws IOException {
    // Section-tracking flags: which part of the investigation file we are in.
    boolean investigationSection = false;
    boolean studySection = false;
    boolean studyTarget = false;
    // Accumulates study fields until we know whether this study block matches studyFileName.
    Map<String, String> map = new HashMap<>();
    try (CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
        for (CSVRecord record : csvParser) {
            String field = record.get(0);
            // A single all-uppercase cell is a section header (e.g. one of `sections`).
            if (field.toUpperCase(Locale.ENGLISH).equals(field) && record.size() == 1) {
                investigationSection = Arrays.asList(sections).contains(field);
                studySection = (studyFileName != null) && field.equals(studySectionField);
            } else if (investigationSection) {
                addMetadata(field, record, metadata);
            } else if (studySection) {
                if (studyTarget) {
                    // Target study already processed; stop at the next study entry.
                    break;
                }
                String value = record.get(1);
                map.put(field, value);
                studyTarget = field.equals(studyFileNameField) && value.equals(studyFileName);
                if (studyTarget) {
                    // Flush the buffered study fields into the metadata object.
                    mapStudyToMetadata(map, metadata);
                    studySection = false;
                }
            } else if (studyTarget) {
                addMetadata(field, record, metadata);
            }
        }
    }
}
Example use of org.apache.commons.csv.CSVRecord in the Apache Tika project: class ISATabUtils, method parseAssay.
/**
 * Renders an ISA-Tab assay file (tab-separated) as an XHTML table:
 * first record as the header, remaining records as body rows.
 *
 * @param stream   raw assay file content; shielded so the caller's stream stays open
 * @param xhtml    handler receiving the generated table elements
 * @param metadata metadata object passed to the charset detector
 * @param context  parse context, consulted for a TikaConfig
 * @throws IOException   on read failure
 * @throws TikaException on charset-detection failure
 * @throws SAXException  on content-handler failure
 */
public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    TikaInputStream tikaStream = TikaInputStream.get(stream);
    // Use the context-supplied TikaConfig for encoding detection, or the default one.
    TikaConfig config = context.get(TikaConfig.class);
    if (config == null) {
        config = TikaConfig.getDefaultConfig();
    }
    try (AutoDetectReader charsetReader = new AutoDetectReader(new CloseShieldInputStream(tikaStream), metadata, config.getEncodingDetector());
            CSVParser parser = new CSVParser(charsetReader, CSVFormat.TDF)) {
        xhtml.startElement("table");
        Iterator<CSVRecord> rows = parser.iterator();
        // Header row comes from the first record, when present.
        xhtml.startElement("thead");
        if (rows.hasNext()) {
            for (String cell : rows.next()) {
                xhtml.startElement("th");
                xhtml.characters(cell);
                xhtml.endElement("th");
            }
        }
        xhtml.endElement("thead");
        // Body rows come from all remaining records.
        xhtml.startElement("tbody");
        while (rows.hasNext()) {
            CSVRecord row = rows.next();
            xhtml.startElement("tr");
            for (String cell : row) {
                xhtml.startElement("td");
                xhtml.characters(cell);
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
    }
}
Example use of org.apache.commons.csv.CSVRecord in the Kylo project (Teradata): class CSVAutoDetect, method guessDelimiter.
/**
 * Guesses the field delimiter of a sample by checking which candidate
 * delimiter produces a consistent field count across lines.
 *
 * Bug fix: the final tie-break loop previously evaluated
 * {@code candidates.get(count++) != null}, which always returned the first
 * key in preferred order (list elements added are never null) and could
 * throw IndexOutOfBoundsException when there were more keys than
 * candidates. It now returns the first delimiter in preferred order that
 * actually survived the per-line consistency check.
 *
 * @param lineStats per-line delimiter statistics for the sample
 * @param value     the raw sample text
 * @param quote     quote character to use while test-parsing
 * @param headerRow whether the first record should be treated as a header
 * @return the guessed delimiter, or null if none is consistent
 * @throws IOException if test-parsing the sample fails
 */
private Character guessDelimiter(List<LineStats> lineStats, String value, Character quote, boolean headerRow) throws IOException {
    // Assume the delimiter appears in the first line and compare against subsequent lines.
    if (lineStats.size() > 0) {
        LineStats firstLineStat = lineStats.get(0);
        Map<Character, Integer> firstLineDelimCounts = firstLineStat.calcDelimCountsOrdered();
        if (firstLineDelimCounts != null && firstLineDelimCounts.size() > 0) {
            List<Character> candidates = new ArrayList<>();
            Set<Character> firstLineDelimKeys = sortDelimitersIntoPreferredOrder(firstLineDelimCounts.keySet());
            for (Character delim : firstLineDelimKeys) {
                CSVFormat format;
                if (headerRow) {
                    format = CSVFormat.DEFAULT.withFirstRecordAsHeader().withDelimiter(delim).withQuote(quote);
                } else {
                    format = CSVFormat.DEFAULT.withDelimiter(delim).withQuote(quote);
                }
                // Test-parse the sample: if every record has the same width as the
                // header, this delimiter is accepted immediately.
                try (StringReader sr = new StringReader(value)) {
                    try (CSVParser parser = format.parse(sr)) {
                        if (parser.getHeaderMap() != null) {
                            int size = parser.getHeaderMap().size();
                            List<CSVRecord> records = parser.getRecords();
                            boolean match = records.stream().allMatch(record -> record.size() == size);
                            if (match) {
                                return delim;
                            }
                        }
                    }
                }
                // Fallback: the delimiter is a candidate if it occurs the same
                // number of times on every line as on the first line.
                Integer delimCount = firstLineDelimCounts.get(delim);
                boolean match = true;
                for (int i = 1; i < lineStats.size() && match; i++) {
                    LineStats thisLine = lineStats.get(i);
                    Integer rowDelimCount = thisLine.delimStats.get(delim);
                    match = delimCount.equals(rowDelimCount);
                }
                if (match) {
                    candidates.add(delim);
                }
            }
            if (candidates.size() > 0) {
                if (candidates.size() == 1) {
                    // All lines agree on a single delimiter.
                    return candidates.get(0);
                }
                // Multiple candidates: return the one ranked highest in preferred order.
                for (Character delim : firstLineDelimKeys) {
                    if (candidates.contains(delim)) {
                        return delim;
                    }
                }
            }
        }
    }
    return null;
}
Example use of org.apache.commons.csv.CSVRecord in the Apache NiFi project: class SimpleCsvFileLookupService, method loadCache.
/**
 * (Re)loads the lookup table from {@code csvFile} into {@code cache}.
 * Skips the reload entirely if another thread already holds the lock.
 *
 * Bug fix: the FileReader was previously never closed, leaking a file
 * handle on every cache load; it is now managed with try-with-resources.
 *
 * @throws IllegalStateException on an empty or (when not ignored) duplicate lookup key
 * @throws IOException           if the CSV file cannot be read
 */
private void loadCache() throws IllegalStateException, IOException {
    if (lock.tryLock()) {
        try {
            final ComponentLog logger = getLogger();
            if (logger.isDebugEnabled()) {
                logger.debug("Loading lookup table from file: " + csvFile);
            }
            final Map<String, String> properties = new HashMap<>();
            // NOTE(review): FileReader reads with the platform default charset —
            // confirm the lookup file is always written in that encoding.
            try (final FileReader reader = new FileReader(csvFile)) {
                final Iterable<CSVRecord> records = csvFormat.withFirstRecordAsHeader().parse(reader);
                for (final CSVRecord record : records) {
                    final String key = record.get(lookupKeyColumn);
                    final String value = record.get(lookupValueColumn);
                    if (StringUtils.isBlank(key)) {
                        throw new IllegalStateException("Empty lookup key encountered in: " + csvFile);
                    } else if (!ignoreDuplicates && properties.containsKey(key)) {
                        throw new IllegalStateException("Duplicate lookup key encountered: " + key + " in " + csvFile);
                    } else if (ignoreDuplicates && properties.containsKey(key)) {
                        logger.warn("Duplicate lookup key encountered: {} in {}", new Object[] { key, csvFile });
                    }
                    properties.put(key, value);
                }
            }
            this.cache = new ConcurrentHashMap<>(properties);
            if (cache.isEmpty()) {
                logger.warn("Lookup table is empty after reading file: " + csvFile);
            }
        } finally {
            lock.unlock();
        }
    }
}
Aggregations