use of org.talend.dataprep.api.dataset.ColumnMetadata in project data-prep by Talend.
the class XlsRunnable method run.
/**
 * @see Runnable#run()
 */
@Override
public void run() {
    try {
        Workbook workbook = WorkbookFactory.create(rawContent);
        JsonGenerator generator = jsonFactory.createGenerator(jsonOutput);
        // if no sheet name was given, just use the first sheet
        Sheet sheet = isEmpty(metadata.getSheetName()) ? workbook.getSheetAt(0)
                : workbook.getSheet(metadata.getSheetName());
        if (sheet == null) {
            // auto-generated sheet names look like "sheet-" + i, so parse the index back out
            if (StringUtils.startsWith(metadata.getSheetName(), "sheet-")) {
                String sheetNumberStr = StringUtils.removeStart(metadata.getSheetName(), "sheet-");
                sheet = workbook.getSheetAt(Integer.valueOf(sheetNumberStr));
            }
            // still null, so fall back to the first sheet
            if (sheet == null) {
                sheet = workbook.getSheetAt(0);
            }
        }
        generator.writeStartArray();
        List<ColumnMetadata> columns = metadata.getRowMetadata().getColumns();
        serializeColumns(workbook, generator, sheet, columns);
        generator.writeEndArray();
        generator.flush();
    } catch (Exception e) {
        // The consumer may very well interrupt consumption of the stream (e.g. when limit(n) is used for sampling).
        // This is not an issue: the consumer is allowed to partially consume results, and it is up to
        // the consumer to ensure the data it consumed is consistent.
        LOG.debug("Unable to continue serialization for {}. Skipping remaining content.", metadata.getId(), e);
    } finally {
        try {
            jsonOutput.close();
        } catch (IOException e) {
            LOG.error("Unable to close output", e);
        }
    }
}
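The serializeColumns helper is called above but not shown in this snippet. As a minimal sketch of what such a method could look like, assuming it walks the sheet row by row and uses POI's DataFormatter to render any cell type as text (the real data-prep implementation is not shown here and may differ):

    private void serializeColumns(Workbook workbook, JsonGenerator generator, Sheet sheet,
            List<ColumnMetadata> columns) throws IOException {
        // assumption: the workbook argument would only be needed for evaluated formulas
        DataFormatter formatter = new DataFormatter();
        for (Row row : sheet) {
            // skip the header line(s) described by the column metadata
            if (XlsSerializer.isHeaderLine(row.getRowNum(), columns)) {
                continue;
            }
            generator.writeStartObject();
            for (int i = 0; i < columns.size(); i++) {
                generator.writeFieldName(columns.get(i).getId());
                Cell cell = row.getCell(i);
                if (cell == null) {
                    generator.writeNull();
                } else {
                    // DataFormatter renders numeric, date and boolean cells as display text
                    generator.writeString(formatter.formatCellValue(cell));
                }
            }
            generator.writeEndObject();
        }
    }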
use of org.talend.dataprep.api.dataset.ColumnMetadata in project data-prep by Talend.
the class XlsxStreamRunnable method run.
/**
 * @see Runnable#run()
 */
@Override
public void run() {
    try {
        JsonGenerator generator = jsonFactory.createGenerator(jsonOutput);
        Workbook workbook = StreamingReader.builder()
                .bufferSize(4096)
                .rowCacheSize(1)
                .open(rawContent);
        try {
            Sheet sheet = StringUtils.isEmpty(metadata.getSheetName()) ? workbook.getSheetAt(0)
                    : workbook.getSheet(metadata.getSheetName());
            generator.writeStartArray();
            for (Row row : sheet) {
                if (limit > 0 && row.getRowNum() > limit) {
                    break;
                }
                if (!XlsSerializer.isHeaderLine(row.getRowNum(), metadata.getRowMetadata().getColumns())) {
                    generator.writeStartObject();
                    // The data quality analyzer expects every column to be present even when a row
                    // has no value for it, so write each field (with a null value if needed);
                    // otherwise we get exceptions.
                    int i = 0;
                    for (ColumnMetadata columnMetadata : metadata.getRowMetadata().getColumns()) {
                        Cell cell = row.getCell(i);
                        String cellValue = cell == null ? null : cell.getStringCellValue();
                        generator.writeFieldName(columnMetadata.getId());
                        if (cellValue != null) {
                            generator.writeString(cellValue);
                        } else {
                            generator.writeNull();
                        }
                        i++;
                    }
                    generator.writeEndObject();
                }
            }
            generator.writeEndArray();
            generator.flush();
        } finally {
            workbook.close();
        }
    } catch (Exception e) {
        // The consumer may very well interrupt consumption of the stream (e.g. when limit(n) is used for sampling).
        // This is not an issue: the consumer is allowed to partially consume results, and it is up to
        // the consumer to ensure the data it consumed is consistent.
        LOG.debug("Unable to continue serialization for {}. Skipping remaining content.", metadata.getId(), e);
    } finally {
        try {
            jsonOutput.close();
        } catch (IOException e) {
            LOG.error("Unable to close output", e);
        }
    }
}
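StreamingReader comes from the excel-streaming-reader library (com.monitorjbl:xlsx-streamer), which reads .xlsx files with near-constant memory by keeping only a small row cache instead of loading the whole workbook. A minimal standalone usage sketch, assuming that dependency and an illustrative local dataset.xlsx file:

    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    import org.apache.poi.ss.usermodel.Cell;
    import org.apache.poi.ss.usermodel.Row;
    import org.apache.poi.ss.usermodel.Sheet;
    import org.apache.poi.ss.usermodel.Workbook;

    import com.monitorjbl.xlsx.StreamingReader;

    public class StreamingXlsxDemo {

        public static void main(String[] args) throws IOException {
            try (InputStream in = new FileInputStream("dataset.xlsx");
                    Workbook workbook = StreamingReader.builder()
                            .bufferSize(4096) // bytes buffered from the file at a time
                            .rowCacheSize(1) // rows kept in memory at a time
                            .open(in)) {
                Sheet sheet = workbook.getSheetAt(0);
                for (Row row : sheet) {
                    for (Cell cell : row) {
                        // the streaming reader exposes cell content as strings
                        System.out.print(cell.getStringCellValue() + "\t");
                    }
                    System.out.println();
                }
            }
        }
    }

The small rowCacheSize(1) used in XlsxStreamRunnable keeps memory usage minimal, which fits a serializer that only ever walks the sheet forward once.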
use of org.talend.dataprep.api.dataset.ColumnMetadata in project data-prep by Talend.
the class CSVSerializer method writeLineContent.
/**
 * Write the line content.
 *
 * @param reader the CSV reader to use as the data source.
 * @param metadata the dataset metadata used to get the columns.
 * @param generator the JSON generator that actually writes the line content.
 * @param separator the CSV separator to use.
 * @param limit the maximum number of lines in the exported content.
 * @throws IOException if an error occurs.
 */
private void writeLineContent(CSVReader reader, DataSetMetadata metadata, JsonGenerator generator,
        String separator, long limit) throws IOException {
    String[] line;
    int current = 0;
    while ((line = reader.readNext()) != null && withinLimit(limit, current)) {
        // skip empty lines
        if (line.length == 1 && (StringUtils.isEmpty(line[0]) || line[0].charAt(0) == Character.MIN_VALUE)) {
            continue;
        }
        List<ColumnMetadata> columns = metadata.getRowMetadata().getColumns();
        generator.writeStartObject();
        int columnsSize = columns.size();
        for (int i = 0; i < columnsSize; i++) {
            ColumnMetadata columnMetadata = columns.get(i);
            generator.writeFieldName(columnMetadata.getId());
            if (i == columnsSize - 1 && line.length > columnsSize) {
                // the line has more cells than columns: merge the extra content into the last column
                String additionalContent = getRemainingColumns(line, i, separator);
                generator.writeString(cleanCharacters(additionalContent));
            } else if (i < line.length && line[i] != null) {
                // regular cell
                generator.writeString(cleanCharacters(line[i]));
            } else {
                // the line has fewer cells than columns, or the cell is null
                generator.writeNull();
            }
        }
        generator.writeEndObject();
        current++;
    }
}
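withinLimit, getRemainingColumns and cleanCharacters are private CSVSerializer helpers that do not appear in this snippet. A hedged sketch of what the first two might do (the names come from the calls above; the bodies are assumptions):

    private boolean withinLimit(long limit, int current) {
        // assumption: a negative limit means "no limit"
        return limit < 0 || current < limit;
    }

    private String getRemainingColumns(String[] line, int start, String separator) {
        // re-join the overflow cells with the original separator so no raw content is lost
        StringBuilder builder = new StringBuilder(line[start]);
        for (int j = start + 1; j < line.length; j++) {
            builder.append(separator).append(line[j]);
        }
        return builder.toString();
    }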
use of org.talend.dataprep.api.dataset.ColumnMetadata in project data-prep by Talend.
the class DateCalendarConverterTest method testAdapt.
@Test
public void testAdapt() throws Exception {
    assertThat(action.adapt((ColumnMetadata) null), is(action));
    ColumnMetadata column = column().name("myColumn").id(0).type(Type.STRING).build();
    assertThat(action.adapt(column), is(action));
}
use of org.talend.dataprep.api.dataset.ColumnMetadata in project data-prep by Talend.
the class DateParserTest method shouldNotComputePatternFromDQBecauseNullValue.
@Test(expected = DateTimeException.class)
public void shouldNotComputePatternFromDQBecauseNullValue() {
    final ColumnMetadata column = ActionMetadataTestUtils.getColumn(Type.DATE);
    action.guessPattern(null, column);
}