use of org.embulk.spi.Column in project embulk by embulk.
the class ConfigInputPlugin method run.
/**
 * Emits the rows configured for one task as records on {@code output}.
 *
 * <p>The rows come from {@code task.getValues().get(taskIndex)}. Within a row, the JSON node found at a
 * column's index supplies that column's value; a missing ({@code null}) or JSON-null node is written
 * as a null cell.
 *
 * @param taskSource source from which the {@code PluginTask} is loaded
 * @param schema columns visited for every row
 * @param taskIndex selects which list of rows in the task's values is emitted
 * @param output destination handed to the {@code PageBuilder}
 * @return the report created by {@code Exec.newTaskReport()}
 * @throws DataException if a timestamp or JSON cell cannot be parsed
 */
@Override
public TaskReport run(TaskSource taskSource, Schema schema, int taskIndex, PageOutput output) {
    final PluginTask task = taskSource.loadTask(PluginTask.class);
    final List<List<JsonNode>> rows = task.getValues().get(taskIndex);
    final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig());
    final JsonParser jsonParser = new JsonParser();

    try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
        for (final List<JsonNode> row : rows) {
            schema.visitColumns(new ColumnVisitor() {
                public void booleanColumn(Column column) {
                    final JsonNode cell = row.get(column.getIndex());
                    if (isAbsent(cell)) {
                        pageBuilder.setNull(column);
                        return;
                    }
                    pageBuilder.setBoolean(column, cell.asBoolean());
                }

                public void longColumn(Column column) {
                    final JsonNode cell = row.get(column.getIndex());
                    if (isAbsent(cell)) {
                        pageBuilder.setNull(column);
                        return;
                    }
                    pageBuilder.setLong(column, cell.asLong());
                }

                public void doubleColumn(Column column) {
                    final JsonNode cell = row.get(column.getIndex());
                    if (isAbsent(cell)) {
                        pageBuilder.setNull(column);
                        return;
                    }
                    pageBuilder.setDouble(column, cell.asDouble());
                }

                public void stringColumn(Column column) {
                    final JsonNode cell = row.get(column.getIndex());
                    if (isAbsent(cell)) {
                        pageBuilder.setNull(column);
                        return;
                    }
                    pageBuilder.setString(column, cell.asText());
                }

                public void timestampColumn(Column column) {
                    final JsonNode cell = row.get(column.getIndex());
                    if (isAbsent(cell)) {
                        pageBuilder.setNull(column);
                        return;
                    }
                    try {
                        // The parser for a column lives at that column's index in the parser array.
                        pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(cell.asText()));
                    } catch (TimestampParseException ex) {
                        throw new DataException(ex);
                    }
                }

                public void jsonColumn(Column column) {
                    final JsonNode cell = row.get(column.getIndex());
                    if (isAbsent(cell)) {
                        pageBuilder.setNull(column);
                        return;
                    }
                    try {
                        // The node is serialized with toString() and re-parsed by the JsonParser.
                        pageBuilder.setJson(column, jsonParser.parse(cell.toString()));
                    } catch (JsonParseException ex) {
                        throw new DataException(ex);
                    }
                }

                // True when the row holds no node for the column or the node is a JSON null.
                private boolean isAbsent(JsonNode cell) {
                    return cell == null || cell.isNull();
                }
            });
            pageBuilder.addRecord();
        }
        pageBuilder.finish();
    }
    return Exec.newTaskReport();
}
use of org.embulk.spi.Column in project embulk by embulk.
the class CsvParserPlugin method run.
/**
 * Tokenizes every file of {@code input} as CSV and writes one record per CSV row to {@code output}.
 *
 * <p>For each file, up to {@code task.getSkipHeaderLines()} header lines are skipped first. Each row is
 * then converted column by column through a {@code ColumnVisitor}. A row that raises
 * {@code InvalidFormatException}, {@code InvalidValueException} or {@code CsvRecordValidateException}
 * is either fatal or logged and skipped, depending on {@code task.getStopOnInvalidRecord()}.
 *
 * @param taskSource source from which the {@code PluginTask} is loaded
 * @param schema columns visited for every row
 * @param input file input wrapped by the line decoder and tokenizer
 * @param output destination handed to the {@code PageBuilder}
 * @throws DataException if a row is invalid and stopping on invalid records is enabled
 */
@Override
public void run(TaskSource taskSource, final Schema schema, FileInput input, PageOutput output) {
PluginTask task = taskSource.loadTask(PluginTask.class);
final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig());
final JsonParser jsonParser = new JsonParser();
final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task);
final boolean allowOptionalColumns = task.getAllowOptionalColumns();
final boolean allowExtraColumns = task.getAllowExtraColumns();
final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
final int skipHeaderLines = task.getSkipHeaderLines();
try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
while (tokenizer.nextFile()) {
// skip the header lines for each file
// (stops early as soon as skipHeaderLine() reports false)
for (int skipHeaderLineNumber = skipHeaderLines; skipHeaderLineNumber > 0; skipHeaderLineNumber--) {
if (!tokenizer.skipHeaderLine()) {
break;
}
}
if (!tokenizer.nextRecord()) {
// empty file
continue;
}
// One iteration per row; the tokenizer is already positioned on a row when an iteration starts.
while (true) {
boolean hasNextRecord;
try {
// Each visitor method pulls the next column text from the tokenizer and stores it
// in the page builder; a null text becomes a null cell.
schema.visitColumns(new ColumnVisitor() {
public void booleanColumn(Column column) {
String v = nextColumn();
if (v == null) {
pageBuilder.setNull(column);
} else {
// true only when the text is a member of TRUE_STRINGS; any other text is false
pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v));
}
}
public void longColumn(Column column) {
String v = nextColumn();
if (v == null) {
pageBuilder.setNull(column);
} else {
try {
pageBuilder.setLong(column, Long.parseLong(v));
} catch (NumberFormatException e) {
// TODO support default value
throw new CsvRecordValidateException(e);
}
}
}
public void doubleColumn(Column column) {
String v = nextColumn();
if (v == null) {
pageBuilder.setNull(column);
} else {
try {
pageBuilder.setDouble(column, Double.parseDouble(v));
} catch (NumberFormatException e) {
// TODO support default value
throw new CsvRecordValidateException(e);
}
}
}
public void stringColumn(Column column) {
String v = nextColumn();
if (v == null) {
pageBuilder.setNull(column);
} else {
pageBuilder.setString(column, v);
}
}
public void timestampColumn(Column column) {
String v = nextColumn();
if (v == null) {
pageBuilder.setNull(column);
} else {
try {
// the parser for a column lives at that column's index in the parser array
pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v));
} catch (TimestampParseException e) {
// TODO support default value
throw new CsvRecordValidateException(e);
}
}
}
public void jsonColumn(Column column) {
String v = nextColumn();
if (v == null) {
pageBuilder.setNull(column);
} else {
try {
pageBuilder.setJson(column, jsonParser.parse(v));
} catch (JsonParseException e) {
// TODO support default value
throw new CsvRecordValidateException(e);
}
}
}
// Returns the next column text, or null when optional columns are allowed and the
// row has no more columns; otherwise defers to tokenizer.nextColumnOrNull().
private String nextColumn() {
if (allowOptionalColumns && !tokenizer.hasNextColumn()) {
// TODO warning
return null;
}
return tokenizer.nextColumnOrNull();
}
});
// Advance to the following row before committing the current one.
try {
hasNextRecord = tokenizer.nextRecord();
} catch (CsvTokenizer.TooManyColumnsException ex) {
if (allowExtraColumns) {
// Extra columns are tolerated: skip the rest of the line and move on.
// NOTE(review): tooManyColumnsLine is never read; only the skip itself matters here.
String tooManyColumnsLine = tokenizer.skipCurrentLine();
// TODO warning
hasNextRecord = tokenizer.nextRecord();
} else {
// this line will be skipped at the following catch section
throw ex;
}
}
pageBuilder.addRecord();
} catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
// Invalid row: addRecord() above was not reached for it. Skip the line, then either
// fail the run or log a warning and continue with the next row.
String skippedLine = tokenizer.skipCurrentLine();
long lineNumber = tokenizer.getCurrentLineNumber();
if (stopOnInvalidRecord) {
throw new DataException(String.format("Invalid record at line %d: %s", lineNumber, skippedLine), e);
}
log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine));
// exec.notice().skippedLine(skippedLine);
hasNextRecord = tokenizer.nextRecord();
}
if (!hasNextRecord) {
break;
}
}
}
pageBuilder.finish();
}
}
use of org.embulk.spi.Column in project embulk by embulk.
the class JsonParserPlugin method run.
@Override
public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output) {
PluginTask task = taskSource.loadTask(PluginTask.class);
final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
// record column
final Column column = schema.getColumn(0);
try (PageBuilder pageBuilder = newPageBuilder(schema, output);
FileInputInputStream in = new FileInputInputStream(input)) {
while (in.nextFile()) {
boolean evenOneJsonParsed = false;
try (JsonParser.Stream stream = newJsonStream(in, task)) {
Value value;
while ((value = stream.next()) != null) {
try {
if (!value.isMapValue()) {
throw new JsonRecordValidateException(String.format("A Json record must not represent map value but it's %s", value.getValueType().name()));
}
pageBuilder.setJson(column, value);
pageBuilder.addRecord();
evenOneJsonParsed = true;
} catch (JsonRecordValidateException e) {
if (stopOnInvalidRecord) {
throw new DataException(String.format("Invalid record: %s", value.toJson()), e);
}
log.warn(String.format("Skipped record (%s): %s", e.getMessage(), value.toJson()));
}
}
} catch (IOException | JsonParseException e) {
if (Exec.isPreview() && evenOneJsonParsed) {
// ignore in preview if at least one JSON is already parsed.
break;
}
throw new DataException(e);
}
}
pageBuilder.finish();
}
}
use of org.embulk.spi.Column in project embulk by embulk.
the class Timestamps method newTimestampColumnFormatters.
/**
 * Creates an array with one slot per schema column and fills the slots of timestamp columns with
 * formatters.
 *
 * <p>Slots of columns whose type is not a {@code TimestampType} stay {@code null}. For a timestamp
 * column, the per-column option is looked up in {@code columnOptions} by column name; a missing entry
 * is passed on as an absent {@code Optional}.
 *
 * @param formatterTask task-level formatter settings passed to {@code TimestampFormatter.of}
 * @param schema schema whose columns determine the array length and which slots are filled
 * @param columnOptions per-column options keyed by column name
 * @return an array of length {@code schema.getColumnCount()}
 */
public static TimestampFormatter[] newTimestampColumnFormatters(TimestampFormatter.Task formatterTask, Schema schema, Map<String, ? extends TimestampFormatter.TimestampColumnOption> columnOptions) {
    final int columnCount = schema.getColumnCount();
    final TimestampFormatter[] result = new TimestampFormatter[columnCount];
    for (int position = 0; position < columnCount; position++) {
        final Column column = schema.getColumn(position);
        if (!(column.getType() instanceof TimestampType)) {
            // Non-timestamp columns keep a null slot.
            continue;
        }
        final Optional<TimestampFormatter.TimestampColumnOption> columnOption = Optional.fromNullable(columnOptions.get(column.getName()));
        result[position] = TimestampFormatter.of(formatterTask, columnOption);
    }
    return result;
}
use of org.embulk.spi.Column in project embulk by embulk.
the class CsvFormatterPlugin method writeHeader.
/**
 * Writes the header line: each column name passed through {@code setEscapeAndQuoteValue}, with the
 * delimiter written before every column whose index is not 0, followed by a new line.
 *
 * @param schema schema supplying the column names
 * @param encoder encoder that receives the header text
 * @param delimiter character written between column names
 * @param policy quote policy forwarded to {@code setEscapeAndQuoteValue}
 * @param quote quote character forwarded to {@code setEscapeAndQuoteValue}
 * @param escape escape character forwarded to {@code setEscapeAndQuoteValue}
 * @param newline newline string forwarded to {@code setEscapeAndQuoteValue}
 * @param nullString null representation forwarded to {@code setEscapeAndQuoteValue}
 */
private void writeHeader(Schema schema, LineEncoder encoder, char delimiter, QuotePolicy policy, char quote, char escape, String newline, String nullString) {
    final String separator = String.valueOf(delimiter);
    for (Column headerColumn : schema.getColumns()) {
        // The column at index 0 gets no leading separator.
        final boolean needsSeparator = headerColumn.getIndex() != 0;
        if (needsSeparator) {
            encoder.addText(separator);
        }
        encoder.addText(setEscapeAndQuoteValue(headerColumn.getName(), delimiter, policy, quote, escape, newline, nullString));
    }
    encoder.addNewLine();
}
Aggregations