use of org.embulk.spi.Page in project embulk by embulk.
the class Pages method toObjects.
/**
 * Reads every record contained in the given pages and returns them as object arrays.
 *
 * <p>A single {@link PageReader} built from {@code schema} is pointed at each page in turn.
 * Every record the reader yields is converted with {@code toObjects(reader)} and appended,
 * in iteration order, to the result. The reader is closed before this method returns.
 *
 * @param schema schema used to construct the {@link PageReader}
 * @param pages pages to read, consumed in iteration order
 * @return an immutable list holding one {@code Object[]} per record
 */
// TODO use streaming and return Iterable
public static List<Object[]> toObjects(Schema schema, Iterable<Page> pages) {
    final ImmutableList.Builder<Object[]> records = ImmutableList.builder();
    final Iterator<Page> remainingPages = pages.iterator();
    try (PageReader pageReader = new PageReader(schema)) {
        while (remainingPages.hasNext()) {
            final Page currentPage = remainingPages.next();
            pageReader.setPage(currentPage);
            while (pageReader.nextRecord()) {
                final Object[] record = toObjects(pageReader);
                records.add(record);
            }
        }
    }
    return records.build();
}
use of org.embulk.spi.Page in project embulk by embulk.
the class GuessExecutor method guessParserConfig.
/**
 * Guesses the parser configuration for a sample buffer, re-running the guess until the
 * result stops changing.
 *
 * <p>Each round merges the configuration guessed so far into a copy of {@code config},
 * forces the parser type to {@code system_guess}, and runs a {@link FileInputRunner} over
 * {@code sample}. The run is expected to end by throwing {@link GuessedNoticeError}, whose
 * guessed configuration is merged into the accumulated result. Guessing stops as soon as a
 * round leaves the accumulated result unchanged, or after 10 rounds.
 *
 * @param sample buffer holding the sample data to guess from
 * @param config original configuration; it is deep-copied, never modified
 * @param guessPlugins plugins passed to the guess parser as {@code guess_plugins}
 * @param guessParserSampleBufferBytes value passed as {@code guess_parser_sample_buffer_bytes}
 * @return the accumulated guessed configuration
 * @throws NoSampleException if the input reports zero tasks
 */
private ConfigDiff guessParserConfig(Buffer sample, ConfigSource config, List<PluginType> guessPlugins, final int guessParserSampleBufferBytes) {
    ConfigDiff accumulated = Exec.newConfigDiff();

    // repeat guessing up to 10 times
    for (int round = 0; round < 10; round++) {
        // include last-guessed config to run guess input
        final ConfigSource mergedConfig = config.deepCopy().merge(accumulated);
        final ConfigSource guessInputConfig = mergedConfig.deepCopy();

        // override in.parser.type so that FileInputRunner.run uses GuessParserPlugin
        guessInputConfig.getNestedOrSetEmpty("parser")
                .set("type", "system_guess")
                .set("guess_plugins", guessPlugins)
                .set("orig_config", mergedConfig)
                .set("guess_parser_sample_buffer_bytes", guessParserSampleBufferBytes);

        // run FileInputPlugin
        final FileInputRunner guessRunner = new FileInputRunner(new BufferFileInputPlugin(sample));
        ConfigDiff updated;
        try {
            guessRunner.transaction(guessInputConfig, new InputPlugin.Control() {
                @Override
                public List<TaskReport> run(TaskSource inputTaskSource, Schema schema, int taskCount) {
                    if (taskCount == 0) {
                        throw new NoSampleException("No input files to guess");
                    }

                    // No page is expected to reach the output while guessing.
                    final PageOutput rejectingOutput = new PageOutput() {
                        @Override
                        public void add(Page page) {
                            // TODO exception class
                            throw new RuntimeException("Input plugin must be a FileInputPlugin to guess parser configuration");
                        }

                        @Override
                        public void finish() {
                        }

                        @Override
                        public void close() {
                        }
                    };

                    // Only task index 0 is run, and no schema is supplied.
                    guessRunner.run(inputTaskSource, null, 0, rejectingOutput);
                    throw new AssertionError("Guess executor must throw GuessedNoticeError");
                }
            });
            throw new AssertionError("Guess executor must throw GuessedNoticeError");
        } catch (GuessedNoticeError notice) {
            // merge to the last-guessed config
            updated = accumulated.deepCopy().merge(notice.getGuessedConfig());
        }

        if (accumulated.equals(updated)) {
            // not changed
            return accumulated;
        }
        accumulated = updated;
    }

    return accumulated;
}
use of org.embulk.spi.Page in project embulk by embulk.
the class CsvFormatterPlugin method open.
/**
 * Opens a {@link PageOutput} that formats incoming pages as CSV text and writes them to
 * {@code output} through a {@link LineEncoder}.
 *
 * <p>Formatting options are read from the {@code PluginTask} loaded from {@code taskSource}:
 * <ul>
 *   <li>the quote character falls back to {@code '"'} when the task's quote char is
 *       {@code '\0'};</li>
 *   <li>the escape character defaults to a backslash when the quote policy is
 *       {@code QuotePolicy.NONE}, otherwise to the quote character.</li>
 * </ul>
 * A new file is started on the encoder immediately, and a header line is written first when
 * the task asks for one.
 *
 * @param taskSource source of the {@code PluginTask} settings
 * @param schema schema whose columns are visited, in order, for every record
 * @param output destination handed to the {@link LineEncoder}
 * @return a page output that writes one CSV line per record
 */
@Override
public PageOutput open(TaskSource taskSource, final Schema schema, FileOutput output) {
    final PluginTask task = taskSource.loadTask(PluginTask.class);
    final LineEncoder encoder = new LineEncoder(output, task);
    final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
    final char delimiter = task.getDelimiterChar();
    final QuotePolicy quotePolicy = task.getQuotePolicy();
    final char quote = task.getQuoteChar() != '\0' ? task.getQuoteChar() : '"';
    final char escape = task.getEscapeChar().or(quotePolicy == QuotePolicy.NONE ? '\\' : quote);
    final String newlineInField = task.getNewlineInField().getString();
    final String nullString = task.getNullString();

    // create a file
    encoder.nextFile();

    // write header
    if (task.getHeaderLine()) {
        writeHeader(schema, encoder, delimiter, quotePolicy, quote, escape, newlineInField, nullString);
    }

    return new PageOutput() {
        private final PageReader pageReader = new PageReader(schema);
        private final String delimiterString = String.valueOf(delimiter);

        // The visitor keeps no per-record state, so it is created once and reused for
        // every record instead of being allocated inside the record loop.
        private final ColumnVisitor recordWriter = new ColumnVisitor() {
            @Override
            public void booleanColumn(Column column) {
                addDelimiter(column);
                if (pageReader.isNull(column)) {
                    addNullString();
                } else {
                    addValue(Boolean.toString(pageReader.getBoolean(column)));
                }
            }

            @Override
            public void longColumn(Column column) {
                addDelimiter(column);
                if (pageReader.isNull(column)) {
                    addNullString();
                } else {
                    addValue(Long.toString(pageReader.getLong(column)));
                }
            }

            @Override
            public void doubleColumn(Column column) {
                addDelimiter(column);
                if (pageReader.isNull(column)) {
                    addNullString();
                } else {
                    addValue(Double.toString(pageReader.getDouble(column)));
                }
            }

            @Override
            public void stringColumn(Column column) {
                addDelimiter(column);
                if (pageReader.isNull(column)) {
                    addNullString();
                } else {
                    addValue(pageReader.getString(column));
                }
            }

            @Override
            public void timestampColumn(Column column) {
                addDelimiter(column);
                if (pageReader.isNull(column)) {
                    addNullString();
                } else {
                    Timestamp value = pageReader.getTimestamp(column);
                    // Formatters are indexed by column position.
                    addValue(timestampFormatters[column.getIndex()].format(value));
                }
            }

            @Override
            public void jsonColumn(Column column) {
                addDelimiter(column);
                if (pageReader.isNull(column)) {
                    addNullString();
                } else {
                    Value value = pageReader.getJson(column);
                    addValue(value.toJson());
                }
            }

            // Writes the delimiter before every column except the first one.
            private void addDelimiter(Column column) {
                if (column.getIndex() != 0) {
                    encoder.addText(delimiterString);
                }
            }

            // Writes a non-null value after applying quoting and escaping.
            private void addValue(String v) {
                encoder.addText(setEscapeAndQuoteValue(v, delimiter, quotePolicy, quote, escape, newlineInField, nullString));
            }

            // Writes the configured null representation as-is (no quoting applied).
            private void addNullString() {
                encoder.addText(nullString);
            }
        };

        @Override
        public void add(Page page) {
            pageReader.setPage(page);
            while (pageReader.nextRecord()) {
                schema.visitColumns(recordWriter);
                encoder.addNewLine();
            }
        }

        @Override
        public void finish() {
            encoder.finish();
        }

        @Override
        public void close() {
            try {
                encoder.close();
            } finally {
                // PageReader is a closeable resource and was never closed here before.
                // NOTE(review): presumably this releases the last page passed to
                // setPage - confirm against PageReader.close().
                pageReader.close();
            }
        }
    };
}
use of org.embulk.spi.Page in project embulk by embulk.
the class SamplingParserPlugin method runFileInputSampling.
/**
 * Runs a file input with the parser replaced by the sampling parser and returns the
 * sample buffer it produces.
 *
 * <p>The input configuration is copied, its parser type is set to {@code system_sampling}
 * with the configured {@code sample_buffer_bytes}, and {@code decoders} is set to null.
 * Every task is then run once; tasks that throw {@link NotEnoughSampleError} report how
 * much data they had. The task with the largest reported size is re-run with
 * {@code force} set to true on the nested {@code ParserTaskSource}. The sample is
 * delivered by a {@link SampledNoticeError} thrown out of the transaction.
 *
 * @param runner file input runner used for both the transaction and the task runs
 * @param inputConfig input configuration; it is deep-copied, never modified
 * @param sampleBufferConfig configuration loaded as a {@code SampleBufferTask}
 * @return the sample carried by the {@link SampledNoticeError}
 * @throws NoSampleException if there are no tasks, if no task reported a positive size,
 *         or if the forced run ends without delivering a sample
 */
public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig, ConfigSource sampleBufferConfig) {
    final SampleBufferTask sampleBufferTask = sampleBufferConfig.loadConfig(SampleBufferTask.class);

    // override in.parser.type so that FileInputRunner creates SamplingParserPlugin
    final ConfigSource samplingInputConfig = inputConfig.deepCopy();
    samplingInputConfig.getNestedOrSetEmpty("parser")
            .set("type", "system_sampling")
            .set("sample_buffer_bytes", sampleBufferTask.getSampleBufferBytes());
    samplingInputConfig.set("decoders", null);

    try {
        runner.transaction(samplingInputConfig, new InputPlugin.Control() {
            @Override
            public List<TaskReport> run(TaskSource taskSource, Schema schema, int taskCount) {
                if (taskCount == 0) {
                    throw new NoSampleException("No input files to read sample data");
                }

                // First pass: find the task that reported the largest sample size.
                int largestSize = -1;
                int largestTaskIndex = -1;
                for (int taskIndex = 0; taskIndex < taskCount; taskIndex++) {
                    try {
                        runner.run(taskSource, schema, taskIndex, newRejectingOutput());
                    } catch (NotEnoughSampleError ex) {
                        final int reportedSize = ex.getSize();
                        if (reportedSize > largestSize) {
                            largestSize = reportedSize;
                            largestTaskIndex = taskIndex;
                        }
                    }
                }

                if (largestSize <= 0) {
                    throw new NoSampleException("All input files are empty");
                }

                // Second pass: re-run the largest task with "force" enabled.
                taskSource.getNested("ParserTaskSource").set("force", true);
                try {
                    runner.run(taskSource, schema, largestTaskIndex, newRejectingOutput());
                } catch (NotEnoughSampleError ex) {
                    throw new NoSampleException("All input files are smaller than minimum sampling size");
                }
                throw new NoSampleException("All input files are smaller than minimum sampling size");
            }

            // Builds a fresh output that rejects any page; no page is expected while sampling.
            private PageOutput newRejectingOutput() {
                return new PageOutput() {
                    @Override
                    public void add(Page page) {
                        // TODO exception class
                        throw new RuntimeException("Input plugin must be a FileInputPlugin to guess parser configuration");
                    }

                    @Override
                    public void finish() {
                    }

                    @Override
                    public void close() {
                    }
                };
            }
        });
        throw new AssertionError("SamplingParserPlugin must throw SampledNoticeError");
    } catch (SampledNoticeError error) {
        return error.getSample();
    }
}
Aggregations