Search in sources :

Example 1 with Page

use of org.embulk.spi.Page in project embulk by embulk.

the class Pages method toObjects.

// TODO use streaming and return Iterable
/**
 * Reads every record out of the given pages and collects the converted rows.
 *
 * @param schema schema handed to the {@link PageReader} that walks the pages
 * @param pages  pages to read, consumed in iteration order
 * @return an immutable list holding one {@code Object[]} per record, in read order
 */
public static List<Object[]> toObjects(Schema schema, Iterable<Page> pages) {
    final ImmutableList.Builder<Object[]> rows = ImmutableList.builder();
    final Iterator<Page> remainingPages = pages.iterator();
    // The reader is closed by try-with-resources once all pages are drained.
    try (PageReader pageReader = new PageReader(schema)) {
        while (remainingPages.hasNext()) {
            final Page currentPage = remainingPages.next();
            pageReader.setPage(currentPage);
            // Advance record by record until the current page is exhausted.
            for (boolean hasRecord = pageReader.nextRecord(); hasRecord; hasRecord = pageReader.nextRecord()) {
                rows.add(toObjects(pageReader));
            }
        }
    }
    return rows.build();
}
Also used : ImmutableList(com.google.common.collect.ImmutableList) PageReader(org.embulk.spi.PageReader) Page(org.embulk.spi.Page)

Example 2 with Page

use of org.embulk.spi.Page in project embulk by embulk.

the class GuessExecutor method guessParserConfig.

/**
 * Guesses parser configuration for a sample buffer by running the guess input repeatedly.
 *
 * <p>Each round merges the previously guessed diff into a copy of {@code config}, runs a
 * {@link FileInputRunner} over the sample with the parser type overridden to
 * {@code system_guess}, and expects the run to terminate with a {@code GuessedNoticeError}
 * carrying the newly guessed config. Guessing stops as soon as a round adds nothing new,
 * or after 10 rounds.
 *
 * @param sample                       sample data to feed to the guess input
 * @param config                       base configuration; only copies of it are merged into
 * @param guessPlugins                 value stored under {@code guess_plugins} in the parser config
 * @param guessParserSampleBufferBytes value stored under {@code guess_parser_sample_buffer_bytes}
 * @return the accumulated guessed configuration diff
 */
private ConfigDiff guessParserConfig(Buffer sample, ConfigSource config, List<PluginType> guessPlugins, final int guessParserSampleBufferBytes) {
    ConfigDiff lastGuessed = Exec.newConfigDiff();
    // repeat guessing up to 10 times
    for (int roundsLeft = 10; roundsLeft > 0; roundsLeft--) {
        // include last-guessed config to run guess input
        final ConfigSource configWithGuess = config.deepCopy().merge(lastGuessed);
        final ConfigSource guessRunConfig = configWithGuess.deepCopy();
        // override in.parser.type so that FileInputRunner.run uses GuessParserPlugin
        guessRunConfig.getNestedOrSetEmpty("parser")
                .set("type", "system_guess")
                .set("guess_plugins", guessPlugins)
                .set("orig_config", configWithGuess)
                .set("guess_parser_sample_buffer_bytes", guessParserSampleBufferBytes);

        // run FileInputPlugin
        final FileInputRunner input = new FileInputRunner(new BufferFileInputPlugin(sample));
        ConfigDiff merged;
        try {
            input.transaction(guessRunConfig, new InputPlugin.Control() {

                public List<TaskReport> run(TaskSource inputTaskSource, Schema schema, int taskCount) {
                    if (taskCount == 0) {
                        throw new NoSampleException("No input files to guess");
                    }
                    // The page sink must never receive a page: the guess parser is expected
                    // to abort the run with GuessedNoticeError before producing any.
                    input.run(inputTaskSource, null, 0, new PageOutput() {

                        @Override
                        public void add(Page page) {
                            // TODO exception class
                            throw new RuntimeException("Input plugin must be a FileInputPlugin to guess parser configuration");
                        }

                        @Override
                        public void finish() {
                        }

                        @Override
                        public void close() {
                        }
                    });
                    throw new AssertionError("Guess executor must throw GuessedNoticeError");
                }
            });
            throw new AssertionError("Guess executor must throw GuessedNoticeError");
        } catch (GuessedNoticeError notice) {
            // merge to the last-guessed config
            merged = lastGuessed.deepCopy().merge(notice.getGuessedConfig());
        }

        if (lastGuessed.equals(merged)) {
            // not changed; nothing more to learn from further rounds
            break;
        }
        lastGuessed = merged;
    }
    return lastGuessed;
}
Also used : InputPlugin(org.embulk.spi.InputPlugin) FileInputRunner(org.embulk.spi.FileInputRunner) Schema(org.embulk.spi.Schema) Page(org.embulk.spi.Page) ConfigSource(org.embulk.config.ConfigSource) PageOutput(org.embulk.spi.PageOutput) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) ConfigDiff(org.embulk.config.ConfigDiff) TaskSource(org.embulk.config.TaskSource)

Example 3 with Page

use of org.embulk.spi.Page in project embulk by embulk.

the class CsvFormatterPlugin method open.

/**
 * Opens a CSV-formatting {@link PageOutput} that writes each record of every added page
 * as one delimited text line to {@code output}.
 *
 * <p>A new output file is started immediately and, when the task enables it, a header line
 * is written before any record.
 *
 * @param taskSource source from which the {@code PluginTask} settings are loaded
 * @param schema     schema whose columns are visited, in order, for every record
 * @param output     file output wrapped by the {@link LineEncoder}
 * @return a page output that formats records as CSV lines; closing it closes both the
 *         internal {@link PageReader} and the encoder
 */
@Override
public PageOutput open(TaskSource taskSource, final Schema schema, FileOutput output) {
    final PluginTask task = taskSource.loadTask(PluginTask.class);
    final LineEncoder encoder = new LineEncoder(output, task);
    final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
    final char delimiter = task.getDelimiterChar();
    final QuotePolicy quotePolicy = task.getQuotePolicy();
    // A NUL quote char in the task falls back to the double quote.
    final char quote = task.getQuoteChar() != '\0' ? task.getQuoteChar() : '"';
    // Fallback escape char: backslash when quoting is disabled, otherwise the quote char itself.
    final char escape = task.getEscapeChar().or(quotePolicy == QuotePolicy.NONE ? '\\' : quote);
    final String newlineInField = task.getNewlineInField().getString();
    final String nullString = task.getNullString();
    // create a file
    encoder.nextFile();
    // write header
    if (task.getHeaderLine()) {
        writeHeader(schema, encoder, delimiter, quotePolicy, quote, escape, newlineInField, nullString);
    }
    return new PageOutput() {

        private final PageReader pageReader = new PageReader(schema);

        private final String delimiterString = String.valueOf(delimiter);

        /** Writes every record of {@code page} as one line, columns separated by the delimiter. */
        public void add(Page page) {
            pageReader.setPage(page);
            while (pageReader.nextRecord()) {
                schema.visitColumns(new ColumnVisitor() {

                    public void booleanColumn(Column column) {
                        addDelimiter(column);
                        if (!pageReader.isNull(column)) {
                            addValue(Boolean.toString(pageReader.getBoolean(column)));
                        } else {
                            addNullString();
                        }
                    }

                    public void longColumn(Column column) {
                        addDelimiter(column);
                        if (!pageReader.isNull(column)) {
                            addValue(Long.toString(pageReader.getLong(column)));
                        } else {
                            addNullString();
                        }
                    }

                    public void doubleColumn(Column column) {
                        addDelimiter(column);
                        if (!pageReader.isNull(column)) {
                            addValue(Double.toString(pageReader.getDouble(column)));
                        } else {
                            addNullString();
                        }
                    }

                    public void stringColumn(Column column) {
                        addDelimiter(column);
                        if (!pageReader.isNull(column)) {
                            addValue(pageReader.getString(column));
                        } else {
                            addNullString();
                        }
                    }

                    public void timestampColumn(Column column) {
                        addDelimiter(column);
                        if (!pageReader.isNull(column)) {
                            Timestamp value = pageReader.getTimestamp(column);
                            // Formatters are indexed by column position.
                            addValue(timestampFormatters[column.getIndex()].format(value));
                        } else {
                            addNullString();
                        }
                    }

                    public void jsonColumn(Column column) {
                        addDelimiter(column);
                        if (!pageReader.isNull(column)) {
                            Value value = pageReader.getJson(column);
                            addValue(value.toJson());
                        } else {
                            addNullString();
                        }
                    }

                    // Emits the delimiter before every column except the first.
                    private void addDelimiter(Column column) {
                        if (column.getIndex() != 0) {
                            encoder.addText(delimiterString);
                        }
                    }

                    // Non-null values go through the quoting/escaping helper before being written.
                    private void addValue(String v) {
                        encoder.addText(setEscapeAndQuoteValue(v, delimiter, quotePolicy, quote, escape, newlineInField, nullString));
                    }

                    // Null values are written as the configured null string, without quoting.
                    private void addNullString() {
                        encoder.addText(nullString);
                    }
                });
                encoder.addNewLine();
            }
        }

        public void finish() {
            encoder.finish();
        }

        /**
         * Closes the page reader and then the encoder. The encoder is closed in a
         * {@code finally} block so it is closed even if closing the reader fails.
         */
        public void close() {
            try {
                // Previously the reader was never closed, leaking whatever it still held.
                pageReader.close();
            } finally {
                encoder.close();
            }
        }
    };
}
Also used : TimestampFormatter(org.embulk.spi.time.TimestampFormatter) LineEncoder(org.embulk.spi.util.LineEncoder) PageReader(org.embulk.spi.PageReader) Page(org.embulk.spi.Page) Timestamp(org.embulk.spi.time.Timestamp) ColumnVisitor(org.embulk.spi.ColumnVisitor) PageOutput(org.embulk.spi.PageOutput) Column(org.embulk.spi.Column) Value(org.msgpack.value.Value)

Example 4 with Page

use of org.embulk.spi.Page in project embulk by embulk.

the class SamplingParserPlugin method runFileInputSampling.

/**
 * Runs the given file input with the parser type overridden to {@code system_sampling}
 * and returns the sample carried by the resulting {@code SampledNoticeError}.
 *
 * <p>Every task is tried in turn. Tasks that fail with {@code NotEnoughSampleError} are
 * remembered by size, and if no task delivers a sample the largest one is run once more
 * with {@code force} set on its parser task source.
 *
 * @param runner             file input runner to execute
 * @param inputConfig        input configuration; a deep copy is modified, not the original
 * @param sampleBufferConfig configuration from which the {@code SampleBufferTask} is loaded
 * @return the sampled buffer
 */
public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig, ConfigSource sampleBufferConfig) {
    // Page sink for sampling runs: it rejects any page that is added to it.
    final class RejectingPageOutput implements PageOutput {

        @Override
        public void add(Page page) {
            // TODO exception class
            throw new RuntimeException("Input plugin must be a FileInputPlugin to guess parser configuration");
        }

        public void finish() {
        }

        public void close() {
        }
    }

    final SampleBufferTask bufferTask = sampleBufferConfig.loadConfig(SampleBufferTask.class);
    // override in.parser.type so that FileInputRunner creates SamplingParserPlugin
    final ConfigSource samplingConfig = inputConfig.deepCopy();
    samplingConfig.getNestedOrSetEmpty("parser")
            .set("type", "system_sampling")
            .set("sample_buffer_bytes", bufferTask.getSampleBufferBytes());
    samplingConfig.set("decoders", null);

    try {
        runner.transaction(samplingConfig, new InputPlugin.Control() {

            public List<TaskReport> run(TaskSource taskSource, Schema schema, int taskCount) {
                if (taskCount == 0) {
                    throw new NoSampleException("No input files to read sample data");
                }

                // Track the task that produced the largest (but still insufficient) sample.
                int largestSize = -1;
                int largestTaskIndex = -1;
                for (int index = 0; index < taskCount; index++) {
                    try {
                        runner.run(taskSource, schema, index, new RejectingPageOutput());
                    } catch (NotEnoughSampleError tooSmall) {
                        final int size = tooSmall.getSize();
                        if (size > largestSize) {
                            largestSize = size;
                            largestTaskIndex = index;
                        }
                    }
                }
                if (largestSize <= 0) {
                    throw new NoSampleException("All input files are empty");
                }

                // Retry the largest task with "force" set on its parser task source.
                taskSource.getNested("ParserTaskSource").set("force", true);
                try {
                    runner.run(taskSource, schema, largestTaskIndex, new RejectingPageOutput());
                } catch (NotEnoughSampleError tooSmall) {
                    throw new NoSampleException("All input files are smaller than minimum sampling size");
                }
                throw new NoSampleException("All input files are smaller than minimum sampling size");
            }
        });
        throw new AssertionError("SamplingParserPlugin must throw SampledNoticeError");
    } catch (SampledNoticeError sampled) {
        return sampled.getSample();
    }
}
Also used : InputPlugin(org.embulk.spi.InputPlugin) Schema(org.embulk.spi.Schema) Page(org.embulk.spi.Page) ConfigSource(org.embulk.config.ConfigSource) PageOutput(org.embulk.spi.PageOutput) List(java.util.List) TaskSource(org.embulk.config.TaskSource)

Aggregations

Page (org.embulk.spi.Page)4 PageOutput (org.embulk.spi.PageOutput)3 ImmutableList (com.google.common.collect.ImmutableList)2 List (java.util.List)2 ConfigSource (org.embulk.config.ConfigSource)2 TaskSource (org.embulk.config.TaskSource)2 InputPlugin (org.embulk.spi.InputPlugin)2 PageReader (org.embulk.spi.PageReader)2 Schema (org.embulk.spi.Schema)2 ArrayList (java.util.ArrayList)1 ConfigDiff (org.embulk.config.ConfigDiff)1 Column (org.embulk.spi.Column)1 ColumnVisitor (org.embulk.spi.ColumnVisitor)1 FileInputRunner (org.embulk.spi.FileInputRunner)1 Timestamp (org.embulk.spi.time.Timestamp)1 TimestampFormatter (org.embulk.spi.time.TimestampFormatter)1 LineEncoder (org.embulk.spi.util.LineEncoder)1 Value (org.msgpack.value.Value)1