Search in sources:

Example 1 with JsonSchemaInferrer

use of org.talend.daikon.avro.inferrer.JsonSchemaInferrer in project components by Talend.

the class PythonRowDoFn method flatMap.

/**
 * Calls the Python function on one input record and emits one Avro record per element of the
 * returned list.
 *
 * <p>The function receives the {@code toString()} form of {@code input}. If it returns anything
 * other than a {@link PyList}, nothing is emitted for this input. The converter is created lazily
 * from the schema inferred on the first element seen and is then reused for every later element.
 *
 * @param input the record handed to the Python function (as its string form)
 * @param context the context that receives the converted records
 * @throws IOException declared for the schema inference / conversion calls
 */
private void flatMap(IndexedRecord input, ProcessContext context) throws IOException {
    // Invoke the user's Python function with the string form of the input record.
    PyObject outputList = pyFn.__call__(new PyString(input.toString()));
    if (!(outputList instanceof PyList)) {
        // NOTE(review): non-list results are dropped silently, as in the original — confirm this is intended.
        return;
    }
    PyList list = (PyList) outputList;
    for (Object output : list) {
        // Render the Python object once; the same text feeds schema inference and conversion.
        String outputText = output.toString();
        if (jsonGenericRecordConverter == null) {
            // First element seen: infer the schema from it and keep the converter for later elements.
            JsonSchemaInferrer jsonSchemaInferrer = new JsonSchemaInferrer(new ObjectMapper());
            Schema jsonSchema = jsonSchemaInferrer.inferSchema(outputText);
            jsonGenericRecordConverter = new JsonGenericRecordConverter(jsonSchema);
        }
        GenericRecord outputRecord = jsonGenericRecordConverter.convertToAvro(outputText);
        context.output(outputRecord);
    }
}
Also used : PyString(org.python.core.PyString) PyList(org.python.core.PyList) Schema(org.apache.avro.Schema) PyObject(org.python.core.PyObject) JsonGenericRecordConverter(org.talend.daikon.avro.converter.JsonGenericRecordConverter) GenericRecord(org.apache.avro.generic.GenericRecord) JsonSchemaInferrer(org.talend.daikon.avro.inferrer.JsonSchemaInferrer) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)

Example 2 with JsonSchemaInferrer

use of org.talend.daikon.avro.inferrer.JsonSchemaInferrer in project components by Talend.

the class PythonRowDoFn method map.

/**
 * Calls the Python function on one input record and emits its result as a single Avro record.
 *
 * <p>The function receives the {@code toString()} form of {@code input}. The converter is created
 * lazily from the schema inferred on the first result seen and is then reused for later results.
 *
 * @param input the record handed to the Python function (as its string form)
 * @param context the context that receives the converted record
 * @throws IOException declared for the schema inference / conversion calls
 */
private void map(IndexedRecord input, ProcessContext context) throws IOException {
    PyObject output = pyFn.__call__(new PyString(input.toString()));
    // Render the Python result once; the same text feeds schema inference and conversion.
    String outputText = output.toString();
    if (jsonGenericRecordConverter == null) {
        // First result seen: infer the schema from it and keep the converter for later calls.
        JsonSchemaInferrer jsonSchemaInferrer = new JsonSchemaInferrer(new ObjectMapper());
        Schema jsonSchema = jsonSchemaInferrer.inferSchema(outputText);
        jsonGenericRecordConverter = new JsonGenericRecordConverter(jsonSchema);
    }
    GenericRecord outputRecord = jsonGenericRecordConverter.convertToAvro(outputText);
    context.output(outputRecord);
}
Also used : PyString(org.python.core.PyString) Schema(org.apache.avro.Schema) JsonGenericRecordConverter(org.talend.daikon.avro.converter.JsonGenericRecordConverter) GenericRecord(org.apache.avro.generic.GenericRecord) PyObject(org.python.core.PyObject) JsonSchemaInferrer(org.talend.daikon.avro.inferrer.JsonSchemaInferrer) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)

Example 3 with JsonSchemaInferrer

use of org.talend.daikon.avro.inferrer.JsonSchemaInferrer in project components by Talend.

the class FixedDatasetRuntime method getValues.

/**
 * Turns the values configured on this dataset into Avro records, returning at most {@code limit}
 * of them.
 *
 * <ul>
 * <li>CSV: the values are parsed as RFC 4180 CSV using the configured field and record delimiters
 * and converted with the dataset schema.</li>
 * <li>JSON: the values are read as a sequence of JSON documents; the schema is inferred from the
 * first document and reused for the following ones.</li>
 * <li>AVRO: when {@code isRandom()} is true, exactly {@code limit} records are generated from the
 * schema; otherwise the values are decoded as Avro JSON until {@code limit} is reached or the
 * input ends.</li>
 * </ul>
 *
 * <p>Parsing failures ({@link IOException}) are rethrown as the error built by
 * {@code LocalIOErrorCode} for the corresponding format.
 *
 * @param limit the maximum number of records to return; a value of zero or less yields an empty list
 * @return the records, in input order; empty when the format matches none of the cases above
 */
public List<IndexedRecord> getValues(int limit) {
    List<IndexedRecord> values = new ArrayList<>();
    switch(properties.format.getValue()) {
        case CSV:
            try {
                CsvRecordToIndexedRecordConverter csvConverter = new CsvRecordToIndexedRecordConverter(getSchema());
                for (CSVRecord r : CSVFormat.RFC4180
                        .withDelimiter(properties.getFieldDelimiter().charAt(0))
                        .withRecordSeparator(properties.getRecordDelimiter())
                        .parse(new StringReader(properties.values.getValue()))) {
                    // Honor the limit here as the JSON and AVRO branches do.
                    if (values.size() >= limit) {
                        break;
                    }
                    values.add(csvConverter.convertToAvro(r));
                }
            } catch (IOException e) {
                throw LocalIOErrorCode.createCannotParseSchema(e, properties.values.getValue());
            }
            break;
        case JSON:
            ObjectMapper mapper = new ObjectMapper();
            JsonSchemaInferrer jsonSchemaInferrer = new JsonSchemaInferrer(mapper);
            // Created lazily from the first document, then reused for the rest.
            JsonGenericRecordConverter converter = null;
            JsonFactory jsonFactory = new JsonFactory();
            try (StringReader r = new StringReader(properties.values.getValue())) {
                Iterator<JsonNode> value = mapper.readValues(jsonFactory.createParser(r), JsonNode.class);
                int count = 0;
                while (value.hasNext() && count++ < limit) {
                    String json = value.next().toString();
                    if (converter == null) {
                        Schema jsonSchema = jsonSchemaInferrer.inferSchema(json);
                        converter = new JsonGenericRecordConverter(jsonSchema);
                    }
                    values.add(converter.convertToAvro(json));
                }
            } catch (IOException e) {
                throw LocalIOErrorCode.createCannotParseJson(e, properties.schema.getValue(), properties.values.getValue());
            }
            break;
        case AVRO:
            Schema schema = getSchema();
            if (isRandom()) {
                // Generate exactly `limit` records, using the loop index as the row id.
                GeneratorFunction<IndexedRecord> gf = (GeneratorFunction<IndexedRecord>) GeneratorFunctions.of(schema);
                GeneratorFunction.GeneratorContext ctx = GeneratorFunction.GeneratorContext.of(0, 0L);
                for (int i = 0; i < limit; i++) {
                    ctx.setRowId(i);
                    values.add(gf.apply(ctx));
                }
            } else {
                // Encode explicitly as UTF-8 so non-ASCII values do not depend on the JVM's default charset.
                try (ByteArrayInputStream bais = new ByteArrayInputStream(
                        properties.values.getValue().trim().getBytes(java.nio.charset.StandardCharsets.UTF_8))) {
                    JsonDecoder decoder = DecoderFactory.get().jsonDecoder(schema, bais);
                    DatumReader<IndexedRecord> reader = new GenericDatumReader<>(schema);
                    int count = 0;
                    while (count++ < limit) {
                        values.add(reader.read(null, decoder));
                    }
                } catch (EOFException ignored) {
                    // Indicates the end of the values: fewer than `limit` records were available.
                } catch (IOException e) {
                    throw LocalIOErrorCode.createCannotParseAvroJson(e, properties.schema.getValue(), properties.values.getValue());
                }
            }
            break;
    }
    return values;
}
Also used : IndexedRecord(org.apache.avro.generic.IndexedRecord) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) JsonFactory(com.fasterxml.jackson.core.JsonFactory) JsonNode(com.fasterxml.jackson.databind.JsonNode) IOException(java.io.IOException) JsonDecoder(org.apache.avro.io.JsonDecoder) ByteArrayInputStream(java.io.ByteArrayInputStream) StringReader(java.io.StringReader) EOFException(java.io.EOFException) CSVRecord(org.apache.commons.csv.CSVRecord) JsonGenericRecordConverter(org.talend.daikon.avro.converter.JsonGenericRecordConverter) GeneratorFunction(org.talend.components.adapter.beam.io.rowgenerator.GeneratorFunction) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JsonSchemaInferrer(org.talend.daikon.avro.inferrer.JsonSchemaInferrer)

Example 4 with JsonSchemaInferrer

use of org.talend.daikon.avro.inferrer.JsonSchemaInferrer in project components by Talend.

the class ElasticsearchInputRuntime method expand.

/**
 * Reads documents from Elasticsearch and converts each one to an Avro record.
 *
 * <p>The read uses the connection configuration built from the dataset properties and, when a
 * query value is set, that query. Each document string is converted by a converter that is
 * created lazily from the schema inferred on the first document processed.
 *
 * @param in the pipeline start the read is applied to
 * @return the collection of converted records
 */
@Override
public PCollection<IndexedRecord> expand(PBegin in) {
    ElasticsearchIO.Read reader = ElasticsearchIO.read()
            .withConnectionConfiguration(createConnectionConf(properties.getDatasetProperties()));
    if (properties.query.getValue() != null) {
        reader = reader.withQuery(properties.query.getValue());
    }
    PCollection<String> documents = in.apply(reader);
    return documents.apply("DocumentToIndexedRecord", ParDo.of(new DoFn<String, IndexedRecord>() {

        @ProcessElement
        public void processElement(ProcessContext c) {
            String document = c.element().toString();
            if (jsonGenericRecordConverter == null) {
                // First document processed: infer its schema and keep the converter for later ones.
                Schema inferredSchema = new JsonSchemaInferrer(new ObjectMapper()).inferSchema(document);
                jsonGenericRecordConverter = new JsonGenericRecordConverter(inferredSchema);
            }
            GenericRecord converted = jsonGenericRecordConverter.convertToAvro(document);
            c.output(converted);
        }
    }));
}
Also used : ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) Schema(org.apache.avro.Schema) DoFn(org.apache.beam.sdk.transforms.DoFn) ElasticsearchIO(org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO) JsonGenericRecordConverter(org.talend.daikon.avro.converter.JsonGenericRecordConverter) GenericRecord(org.apache.avro.generic.GenericRecord) JsonSchemaInferrer(org.talend.daikon.avro.inferrer.JsonSchemaInferrer) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)

Aggregations

ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)4 Schema (org.apache.avro.Schema)4 JsonGenericRecordConverter (org.talend.daikon.avro.converter.JsonGenericRecordConverter)4 JsonSchemaInferrer (org.talend.daikon.avro.inferrer.JsonSchemaInferrer)4 GenericRecord (org.apache.avro.generic.GenericRecord)3 IndexedRecord (org.apache.avro.generic.IndexedRecord)2 PyObject (org.python.core.PyObject)2 PyString (org.python.core.PyString)2 JsonFactory (com.fasterxml.jackson.core.JsonFactory)1 JsonNode (com.fasterxml.jackson.databind.JsonNode)1 ByteArrayInputStream (java.io.ByteArrayInputStream)1 EOFException (java.io.EOFException)1 IOException (java.io.IOException)1 StringReader (java.io.StringReader)1 ArrayList (java.util.ArrayList)1 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)1 JsonDecoder (org.apache.avro.io.JsonDecoder)1 ElasticsearchIO (org.apache.beam.sdk.io.elasticsearch.ElasticsearchIO)1 DoFn (org.apache.beam.sdk.transforms.DoFn)1 CSVRecord (org.apache.commons.csv.CSVRecord)1