Search in sources :

Example 21 with Schema

use of org.embulk.spi.Schema in project embulk by embulk.

the class SamplingParserPlugin method runFileInputSampling.

/**
 * Obtains a sample buffer by running the given file input with its parser
 * type replaced by {@code system_sampling}.
 *
 * <p>The sample is not returned through the normal call path: the transaction
 * is expected to be aborted by a {@code SampledNoticeError}, whose sample is
 * returned from this method. Every task is tried once; if none of them
 * produces a sample, the task that reported the largest size through
 * {@code NotEnoughSampleError} is run again with the {@code force} flag set.
 *
 * @param runner the file input runner used for both the transaction and the task runs
 * @param inputConfig the input configuration; modifications are applied to its {@code deepCopy()}
 * @param sampleBufferConfig configuration loaded as {@code SampleBufferTask} to get the sample buffer size
 * @return the sample carried by the caught {@code SampledNoticeError}
 * @throws NoSampleException if there are no tasks, if no task reported a positive size,
 *         or if the forced run does not yield a sample either
 * @throws AssertionError if the transaction returns normally instead of throwing {@code SampledNoticeError}
 */
public static Buffer runFileInputSampling(final FileInputRunner runner, ConfigSource inputConfig, ConfigSource sampleBufferConfig) {
    final SampleBufferTask bufferTask = sampleBufferConfig.loadConfig(SampleBufferTask.class);

    // override in.parser.type so that FileInputRunner creates SamplingParserPlugin
    final ConfigSource samplingConfig = inputConfig.deepCopy();
    samplingConfig.getNestedOrSetEmpty("parser")
            .set("type", "system_sampling")
            .set("sample_buffer_bytes", bufferTask.getSampleBufferBytes());
    samplingConfig.set("decoders", null);

    final InputPlugin.Control samplingControl = new InputPlugin.Control() {

        @Override
        public List<TaskReport> run(TaskSource taskSource, Schema schema, int taskCount) {
            if (taskCount == 0) {
                throw new NoSampleException("No input files to read sample data");
            }

            // First pass: try every task, remembering which one reported the largest size.
            int largestSize = -1;
            int largestSizeTask = -1;
            for (int index = 0; index < taskCount; index++) {
                try {
                    runner.run(taskSource, schema, index, newNonFilePageOutput());
                } catch (NotEnoughSampleError ex) {
                    if (ex.getSize() > maxOf(largestSize)) {
                        largestSize = ex.getSize();
                        largestSizeTask = index;
                    }
                }
            }
            if (largestSize <= 0) {
                throw new NoSampleException("All input files are empty");
            }

            // Second pass: re-run the task with the largest reported size, this time with "force" set.
            taskSource.getNested("ParserTaskSource").set("force", true);
            try {
                runner.run(taskSource, schema, largestSizeTask, newNonFilePageOutput());
            } catch (NotEnoughSampleError ex) {
                throw new NoSampleException("All input files are smaller than minimum sampling size");
            }
            // Reaching this point means the forced run finished without signalling a sample.
            throw new NoSampleException("All input files are smaller than minimum sampling size");
        }

        // Identity helper that keeps the comparison above reading as "size > current maximum".
        private int maxOf(int currentMaximum) {
            return currentMaximum;
        }
    };

    try {
        runner.transaction(samplingConfig, samplingControl);
        throw new AssertionError("SamplingParserPlugin must throw SampledNoticeError");
    } catch (SampledNoticeError error) {
        return error.getSample();
    }
}

/**
 * Creates a fresh {@code PageOutput} that rejects every page: {@code add}
 * always throws, while {@code finish} and {@code close} do nothing.
 */
private static PageOutput newNonFilePageOutput() {
    return new PageOutput() {

        @Override
        public void add(Page page) {
            // TODO exception class
            throw new RuntimeException("Input plugin must be a FileInputPlugin to guess parser configuration");
        }

        @Override
        public void finish() {
        }

        @Override
        public void close() {
        }
    };
}
Also used : InputPlugin(org.embulk.spi.InputPlugin) Schema(org.embulk.spi.Schema) Page(org.embulk.spi.Page) ConfigSource(org.embulk.config.ConfigSource) PageOutput(org.embulk.spi.PageOutput) List(java.util.List) TaskSource(org.embulk.config.TaskSource)

Example 22 with Schema

use of org.embulk.spi.Schema in project MiscellaneousStudy by mikoto2000.

the class MyPageOutput method transaction.

/**
 * Logs the arguments to standard output, then builds the output schema and
 * hands it to {@code control}.
 *
 * <p>The output schema is the input schema with a {@code LONG} column named
 * {@code lineNumber} placed before the input columns and a {@code STRING}
 * column named {@code additional} placed after them.
 *
 * @param config plugin configuration, loaded as {@code PluginTask}
 * @param inputSchema schema whose columns are copied into the output schema
 * @param control callback invoked with the dumped task and the output schema
 */
@Override
public void transaction(ConfigSource config, Schema inputSchema, FilterPlugin.Control control) {
    System.out.println("transaction!");
    System.out.println("config: " + config);
    System.out.println("inputSchema: " + inputSchema);
    System.out.println("control: " + control);

    final PluginTask pluginTask = config.loadConfig(PluginTask.class);
    final java.util.List<Column> inputColumns = inputSchema.getColumns();

    final Schema.Builder schemaBuilder = Schema.builder();
    // Leading column that holds the sequential line number.
    schemaBuilder.add("lineNumber", Types.LONG);
    // Carry over every input column unchanged, in order.
    for (final Column inputColumn : inputColumns) {
        schemaBuilder.add(inputColumn.getName(), inputColumn.getType());
    }
    // Trailing column that holds the additional string.
    schemaBuilder.add("additional", Types.STRING);

    control.run(pluginTask.dump(), schemaBuilder.build());
}
Also used : Column(org.embulk.spi.Column) Schema(org.embulk.spi.Schema)

Aggregations

Schema (org.embulk.spi.Schema)22 TaskSource (org.embulk.config.TaskSource)12 Column (org.embulk.spi.Column)10 ConfigSource (org.embulk.config.ConfigSource)9 ConfigException (org.embulk.config.ConfigException)8 List (java.util.List)7 FilterPlugin (org.embulk.spi.FilterPlugin)7 Test (org.junit.Test)6 ImmutableList (com.google.common.collect.ImmutableList)5 InputPlugin (org.embulk.spi.InputPlugin)5 SchemaConfigException (org.embulk.spi.SchemaConfigException)5 ArrayList (java.util.ArrayList)4 ConfigDiff (org.embulk.config.ConfigDiff)3 PageOutput (org.embulk.spi.PageOutput)3 HashMap (java.util.HashMap)2 TaskReport (org.embulk.config.TaskReport)2 ExecutorPlugin (org.embulk.spi.ExecutorPlugin)2 Page (org.embulk.spi.Page)2 LineDecoder (org.embulk.spi.util.LineDecoder)2 JsonNode (com.fasterxml.jackson.databind.JsonNode)1