Use of org.embulk.spi.util.LineDecoder in the Embulk project (embulk/embulk).
Class CsvParserPlugin, method run.
// Parses CSV rows from the given FileInput and emits them as pages to output.
// For each input file: skips the configured number of header lines, then reads
// record-by-record, converting each CSV column to its schema-declared type via
// a ColumnVisitor. Records that fail to parse are either skipped with a warning
// or abort the run, depending on stop_on_invalid_record.
@Override
public void run(TaskSource taskSource, final Schema schema, FileInput input, PageOutput output) {
    PluginTask task = taskSource.loadTask(PluginTask.class);
    // One timestamp parser per schema column, indexed by column index.
    final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig());
    final JsonParser jsonParser = new JsonParser();
    final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task);
    // Cache task options as locals so the anonymous visitor below can capture them.
    final boolean allowOptionalColumns = task.getAllowOptionalColumns();
    final boolean allowExtraColumns = task.getAllowExtraColumns();
    final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
    final int skipHeaderLines = task.getSkipHeaderLines();
    // try-with-resources guarantees the PageBuilder is closed even on error.
    try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
        while (tokenizer.nextFile()) {
            // skip the header lines for each file
            for (int skipHeaderLineNumber = skipHeaderLines; skipHeaderLineNumber > 0; skipHeaderLineNumber--) {
                if (!tokenizer.skipHeaderLine()) {
                    break;  // file has fewer lines than skip_header_lines; nothing left to skip
                }
            }
            if (!tokenizer.nextRecord()) {
                // empty file
                continue;
            }
            while (true) {
                boolean hasNextRecord;
                try {
                    // Visit schema columns in order; each callback consumes the next
                    // CSV column and writes a typed value (or null) to the builder.
                    schema.visitColumns(new ColumnVisitor() {
                        public void booleanColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                // Any value not listed in TRUE_STRINGS becomes false.
                                pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v));
                            }
                        }
                        public void longColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setLong(column, Long.parseLong(v));
                                } catch (NumberFormatException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }
                        public void doubleColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setDouble(column, Double.parseDouble(v));
                                } catch (NumberFormatException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }
                        public void stringColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                pageBuilder.setString(column, v);
                            }
                        }
                        public void timestampColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    // Uses the parser configured for this column's index.
                                    pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v));
                                } catch (TimestampParseException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }
                        public void jsonColumn(Column column) {
                            String v = nextColumn();
                            if (v == null) {
                                pageBuilder.setNull(column);
                            } else {
                                try {
                                    pageBuilder.setJson(column, jsonParser.parse(v));
                                } catch (JsonParseException e) {
                                    // TODO support default value
                                    throw new CsvRecordValidateException(e);
                                }
                            }
                        }
                        // Returns the next CSV column value, or null when the record
                        // ran out of columns and allow_optional_columns is enabled.
                        private String nextColumn() {
                            if (allowOptionalColumns && !tokenizer.hasNextColumn()) {
                                // TODO warning
                                return null;
                            }
                            return tokenizer.nextColumnOrNull();
                        }
                    });
                    try {
                        hasNextRecord = tokenizer.nextRecord();
                    } catch (CsvTokenizer.TooManyColumnsException ex) {
                        if (allowExtraColumns) {
                            // Discard the extra columns and keep the record already built.
                            String tooManyColumnsLine = tokenizer.skipCurrentLine();
                            // TODO warning
                            hasNextRecord = tokenizer.nextRecord();
                        } else {
                            // this line will be skipped at the following catch section
                            throw ex;
                        }
                    }
                    pageBuilder.addRecord();
                } catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
                    // Recover: skip the offending physical line and either abort or warn.
                    String skippedLine = tokenizer.skipCurrentLine();
                    long lineNumber = tokenizer.getCurrentLineNumber();
                    if (stopOnInvalidRecord) {
                        throw new DataException(String.format("Invalid record at line %d: %s", lineNumber, skippedLine), e);
                    }
                    log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine));
                    // exec.notice().skippedLine(skippedLine);
                    hasNextRecord = tokenizer.nextRecord();
                }
                if (!hasNextRecord) {
                    break;  // no more records in this file; move on to the next file
                }
            }
        }
        pageBuilder.finish();
    }
}
Use of org.embulk.spi.util.LineDecoder in the Embulk project (embulk/embulk).
Class TestCsvTokenizer, method recoverFromQuotedSizeLimitExceededException.
// Verifies that CsvTokenizer can recover after a quoted value exceeds
// max_quoted_size_limit: the broken line is skipped via skipCurrentLine()
// and tokenizing resumes cleanly on the following records.
@Test
public void recoverFromQuotedSizeLimitExceededException() throws Exception {
    config.set("max_quoted_size_limit", 12);
    reloadPluginTask();
    String[] lines = new String[] {
            "v1,v2",      // valid line, read normally
            "v3,\"0123",  // broken line (unterminated quote exceeding the limit); should be skipped
            "v4,v5",      // should not be skipped
            "v6,v7" };    // should not be skipped
    FileInput input = newFileInputFromLines(task, lines);
    LineDecoder decoder = new LineDecoder(input, task);
    CsvTokenizer tokenizer = new CsvTokenizer(decoder, task);
    tokenizer.nextFile();
    assertTrue(tokenizer.nextRecord());
    assertEquals("v1", tokenizer.nextColumn());
    assertEquals("v2", tokenizer.nextColumn());
    assertTrue(tokenizer.nextRecord());
    assertEquals("v3", tokenizer.nextColumn());
    try {
        tokenizer.nextColumn();
        fail("expected QuotedSizeLimitExceededException");
    } catch (CsvTokenizer.QuotedSizeLimitExceededException e) {
        // expected: the quoted value "0123… exceeds max_quoted_size_limit
    }
    // Recover by skipping the broken physical line, then continue tokenizing.
    assertEquals("v3,\"0123", tokenizer.skipCurrentLine());
    assertTrue(tokenizer.nextRecord());
    assertEquals("v4", tokenizer.nextColumn());
    assertEquals("v5", tokenizer.nextColumn());
    assertTrue(tokenizer.nextRecord());
    assertEquals("v6", tokenizer.nextColumn());
    assertEquals("v7", tokenizer.nextColumn());
}
Use of org.embulk.spi.util.LineDecoder in the Embulk project (embulk/embulk).
Class TestCsvTokenizer, method parse.
// Tokenizes the whole input and returns every record as a list of raw column
// strings (null for empty/missing columns), one inner list per record. The
// number of columns read per record equals the schema's column count.
private static List<List<String>> parse(CsvParserPlugin.PluginTask task, FileInput input) {
    final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task);
    final Schema schema = task.getSchemaConfig().toSchema();
    tokenizer.nextFile();
    final List<List<String>> parsed = new ArrayList<>();
    while (tokenizer.nextRecord()) {
        // Pre-size the row; one value is pulled per schema column.
        final List<String> row = new ArrayList<>(schema.getColumns().size());
        for (Column unused : schema.getColumns()) {
            row.add(tokenizer.nextColumnOrNull());
        }
        parsed.add(row);
    }
    return parsed;
}
Aggregations