
Example 1 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project crunch by cloudera.

From the class SpecificAvroGroupByTest, method createPersonAvroFile:

private void createPersonAvroFile(File avroFile) throws IOException {
    // Build a Person record with a name, age, and sibling names
    Builder person = Person.newBuilder();
    person.setAge(40);
    person.setName("Bob");
    List<CharSequence> siblingNames = Lists.newArrayList();
    siblingNames.add("Bob" + "1");
    siblingNames.add("Bob" + "2");
    person.setSiblingnames(siblingNames);
    // Write the record to an Avro container file using the generated schema
    FileOutputStream outputStream = new FileOutputStream(avroFile);
    SpecificDatumWriter<Person> writer = new SpecificDatumWriter<Person>(Person.class);
    DataFileWriter<Person> dataFileWriter = new DataFileWriter<Person>(writer);
    dataFileWriter.create(Person.SCHEMA$, outputStream);
    dataFileWriter.append(person.build());
    dataFileWriter.close();
    outputStream.close();
}
Also used: Builder (org.apache.crunch.test.Person.Builder), FileOutputStream (java.io.FileOutputStream), DataFileWriter (org.apache.avro.file.DataFileWriter), Person (org.apache.crunch.test.Person), SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter)
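
A file written this way is normally read back with the matching specific reader classes. The following is a minimal sketch, not part of the crunch test; it assumes the same generated Person class (with getName()/getAge() accessors) and the avroFile produced above:

private void readPersonAvroFile(File avroFile) throws IOException {
    // SpecificDatumReader is the read-side counterpart of SpecificDatumWriter
    SpecificDatumReader<Person> datumReader = new SpecificDatumReader<Person>(Person.class);
    DataFileReader<Person> dataFileReader = new DataFileReader<Person>(avroFile, datumReader);
    try {
        while (dataFileReader.hasNext()) {
            Person person = dataFileReader.next();
            System.out.println(person.getName() + " (" + person.getAge() + ")");
        }
    } finally {
        dataFileReader.close();
    }
}
Classes used: DataFileReader (org.apache.avro.file.DataFileReader), SpecificDatumReader (org.apache.avro.specific.SpecificDatumReader), Person (org.apache.crunch.test.Person), File (java.io.File)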

Example 2 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project pinot by linkedin.

From the class UploadRefreshDeleteIntegrationTest, method generateAndUploadRandomSegment1:

protected void generateAndUploadRandomSegment1(final String segmentName, int rowCount) throws Exception {
    // Generate an Avro file with rowCount random rows, using the dummy.avsc schema
    ThreadLocalRandom random = ThreadLocalRandom.current();
    Schema schema = new Schema.Parser().parse(new File(TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource("dummy.avsc"))));
    GenericRecord record = new GenericData.Record(schema);
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<GenericRecord>(datumWriter);
    final File avroFile = new File(_tmpDir, segmentName + ".avro");
    fileWriter.create(schema, avroFile);
    for (int i = 0; i < rowCount; i++) {
        record.put(0, random.nextInt());
        fileWriter.append(record);
    }
    fileWriter.close();
    // Build a segment tar.gz from the Avro file and locate the generated archive
    final int segmentIndex = Integer.parseInt(segmentName.split("_")[1]);
    final String TAR_GZ_FILE_EXTENTION = ".tar.gz";
    File segmentTarDir = new File(_tarsDir, segmentName);
    buildSegment(segmentTarDir, avroFile, segmentIndex, segmentName, 0);
    String segmentFileName = segmentName;
    for (String name : segmentTarDir.list()) {
        if (name.endsWith(TAR_GZ_FILE_EXTENTION)) {
            segmentFileName = name;
        }
    }
    File file = new File(segmentTarDir, segmentFileName);
    long segmentLength = file.length();
    final File segmentTarDir1 = new File(_tarsDir, segmentName);
    FileUtils.deleteQuietly(segmentTarDir);
    // Rebuild the segment on a background thread while the original tar is uploaded
    new Thread(new Runnable() {

        @Override
        public void run() {
            try {
                buildSegment(segmentTarDir1, avroFile, segmentIndex, segmentName, 5);
            } catch (Exception e) {
                // failures of the concurrent rebuild are ignored by this test
            }
        }
    }).start();
    FileUploadUtils.sendSegmentFile("localhost", "8998", segmentFileName, file, segmentLength, 5, 5);
    // Clean up the temporary Avro file and segment tar directory
    avroFile.delete();
    FileUtils.deleteQuietly(segmentTarDir);
}
Also used: Schema (org.apache.avro.Schema), DataFileWriter (org.apache.avro.file.DataFileWriter), GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter), ThreadLocalRandom (java.util.concurrent.ThreadLocalRandom), GenericRecord (org.apache.avro.generic.GenericRecord), File (java.io.File)
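
The write loop above can also be expressed with try-with-resources and an explicit compression codec, so the writer is closed even if an append fails. This is a minimal sketch, not taken from the pinot test; like the test, it assumes the first field of the schema is an int:

static void writeRandomInts(Schema schema, File avroFile, int rowCount) throws IOException {
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
    // DataFileWriter implements Closeable, so try-with-resources flushes and closes it
    try (DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<GenericRecord>(datumWriter)) {
        // the codec must be set before create()
        fileWriter.setCodec(CodecFactory.deflateCodec(6));
        fileWriter.create(schema, avroFile);
        for (int i = 0; i < rowCount; i++) {
            GenericRecord record = new GenericData.Record(schema);
            record.put(0, ThreadLocalRandom.current().nextInt());
            fileWriter.append(record);
        }
    }
}
Classes used: CodecFactory (org.apache.avro.file.CodecFactory), plus the same Avro classes as in the example above.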

Example 3 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project pinot by linkedin.

From the class UploadRefreshDeleteIntegrationTest, method generateAndUploadRandomSegment:

protected void generateAndUploadRandomSegment(String segmentName, int rowCount) throws Exception {
    // Generate an Avro file with rowCount random rows, using the dummy.avsc schema
    ThreadLocalRandom random = ThreadLocalRandom.current();
    Schema schema = new Schema.Parser().parse(new File(TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource("dummy.avsc"))));
    GenericRecord record = new GenericData.Record(schema);
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<GenericRecord>(datumWriter);
    File avroFile = new File(_tmpDir, segmentName + ".avro");
    fileWriter.create(schema, avroFile);
    for (int i = 0; i < rowCount; i++) {
        record.put(0, random.nextInt());
        fileWriter.append(record);
    }
    fileWriter.close();
    // Build a segment tar from the Avro file on a same-thread executor
    int segmentIndex = Integer.parseInt(segmentName.split("_")[1]);
    File segmentTarDir = new File(_tarsDir, segmentName);
    ensureDirectoryExistsAndIsEmpty(segmentTarDir);
    ExecutorService executor = MoreExecutors.sameThreadExecutor();
    buildSegmentsFromAvro(Collections.singletonList(avroFile), executor, segmentIndex, new File(_segmentsDir, segmentName), segmentTarDir, this.tableName, false, null);
    executor.shutdown();
    executor.awaitTermination(1L, TimeUnit.MINUTES);
    // Upload every generated segment tar, then clean up the temporary files
    for (String segmentFileName : segmentTarDir.list()) {
        File file = new File(segmentTarDir, segmentFileName);
        FileUploadUtils.sendSegmentFile("localhost", "8998", segmentFileName, file, file.length());
    }
    avroFile.delete();
    FileUtils.deleteQuietly(segmentTarDir);
}
Also used: Schema (org.apache.avro.Schema), DataFileWriter (org.apache.avro.file.DataFileWriter), GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter), ExecutorService (java.util.concurrent.ExecutorService), ThreadLocalRandom (java.util.concurrent.ThreadLocalRandom), GenericRecord (org.apache.avro.generic.GenericRecord), File (java.io.File)
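
When the target Avro file already exists, DataFileWriter can append to it instead of recreating it: appendTo() reads the schema and codec from the existing file header. A minimal sketch, not from the pinot test, assuming the file was previously written with a compatible schema:

static void appendRecords(File existingAvroFile, Iterable<GenericRecord> records) throws IOException {
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>();
    // appendTo() opens the existing container file and reuses its schema and codec
    try (DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<GenericRecord>(datumWriter).appendTo(existingAvroFile)) {
        for (GenericRecord record : records) {
            fileWriter.append(record);
        }
    }
}
Classes used: the same Avro classes as in the example above.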

Example 4 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project druid by druid-io.

From the class AvroHadoopInputRowParserTest, method buildPigAvro:

private static GenericRecord buildPigAvro(GenericRecord datum, String inputStorage, String outputStorage) throws IOException {
    final File tmpDir = Files.createTempDir();
    FileReader<GenericRecord> reader = null;
    PigServer pigServer = null;
    try {
        // 0. write avro object into temp file.
        File someAvroDatumFile = new File(tmpDir, "someAvroDatum.avro");
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>());
        dataFileWriter.create(SomeAvroDatum.getClassSchema(), someAvroDatumFile);
        dataFileWriter.append(datum);
        dataFileWriter.close();
        // 1. read avro files into Pig
        pigServer = new PigServer(ExecType.LOCAL);
        pigServer.registerQuery(String.format("A = LOAD '%s' USING %s;", someAvroDatumFile, inputStorage));
        // 2. write new avro file using AvroStorage
        File outputDir = new File(tmpDir, "output");
        pigServer.store("A", String.valueOf(outputDir), outputStorage);
        // 3. read avro object from AvroStorage
        reader = DataFileReader.openReader(new File(outputDir, "part-m-00000.avro"), new GenericDatumReader<GenericRecord>());
        return reader.next();
    } finally {
        if (pigServer != null) {
            pigServer.shutdown();
        }
        Closeables.close(reader, true);
        FileUtils.deleteDirectory(tmpDir);
    }
}
Also used: PigServer (org.apache.pig.PigServer), GenericDatumReader (org.apache.avro.generic.GenericDatumReader), DataFileWriter (org.apache.avro.file.DataFileWriter), GenericRecord (org.apache.avro.generic.GenericRecord), File (java.io.File)
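
On the read side, DataFileReader (used above to fetch a single record) also implements Iterable, so the whole Pig output file could be scanned rather than only its first record. A minimal sketch under that assumption, not part of the druid test:

static void dumpAvroFile(File avroFile) throws IOException {
    // DataFileReader implements Iterable<D> and Closeable
    try (DataFileReader<GenericRecord> reader = new DataFileReader<GenericRecord>(avroFile, new GenericDatumReader<GenericRecord>())) {
        System.out.println("schema: " + reader.getSchema());
        for (GenericRecord record : reader) {
            System.out.println(record);
        }
    }
}
Classes used: DataFileReader (org.apache.avro.file.DataFileReader), plus the same Avro classes as in the example above.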

Example 5 with DataFileWriter

Use of org.apache.avro.file.DataFileWriter in project crunch by cloudera.

From the class AvroOutputFormat, method getRecordWriter:

@Override
public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    // Resolve the output schema: a named output overrides the job-level schema
    Schema schema = null;
    String outputName = conf.get("crunch.namedoutput");
    if (outputName != null && !outputName.isEmpty()) {
        schema = (new Schema.Parser()).parse(conf.get("avro.output.schema." + outputName));
    } else {
        schema = AvroJob.getOutputSchema(context.getConfiguration());
    }
    // Open a DataFileWriter over the task's default work file on the output file system
    ReflectDataFactory factory = Avros.getReflectDataFactory(conf);
    final DataFileWriter<T> WRITER = new DataFileWriter<T>(factory.<T>getWriter());
    Path path = getDefaultWorkFile(context, org.apache.avro.mapred.AvroOutputFormat.EXT);
    WRITER.create(schema, path.getFileSystem(context.getConfiguration()).create(path));
    // The returned RecordWriter appends each datum and closes the Avro writer on close()
    return new RecordWriter<AvroWrapper<T>, NullWritable>() {

        @Override
        public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
            WRITER.append(wrapper.datum());
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            WRITER.close();
        }
    };
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), Schema (org.apache.avro.Schema), DataFileWriter (org.apache.avro.file.DataFileWriter), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), NullWritable (org.apache.hadoop.io.NullWritable), RecordWriter (org.apache.hadoop.mapreduce.RecordWriter), AvroWrapper (org.apache.avro.mapred.AvroWrapper)
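
For an output format like this one, the writer is often tuned before create() is called, for example with a compression codec and a larger sync (block) interval. A minimal sketch of that configuration, not part of the crunch code above; the Snappy codec assumes the snappy-java dependency is on the classpath:

static <T> DataFileWriter<T> configureWriter(DatumWriter<T> datumWriter) {
    DataFileWriter<T> writer = new DataFileWriter<T>(datumWriter);
    // both settings must be applied before create() or appendTo()
    writer.setCodec(CodecFactory.snappyCodec());
    // write a sync marker roughly every 1 MB of serialized data
    writer.setSyncInterval(1024 * 1024);
    return writer;
}
Classes used: CodecFactory (org.apache.avro.file.CodecFactory), DatumWriter (org.apache.avro.io.DatumWriter), plus DataFileWriter as above.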

Aggregations (usage counts across indexed projects)

DataFileWriter (org.apache.avro.file.DataFileWriter): 91
GenericRecord (org.apache.avro.generic.GenericRecord): 54
Schema (org.apache.avro.Schema): 48
GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter): 41
File (java.io.File): 35
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 19
IOException (java.io.IOException): 19
GenericData (org.apache.avro.generic.GenericData): 16
Test (org.junit.Test): 14
FileOutputStream (java.io.FileOutputStream): 13
HashMap (java.util.HashMap): 11
ArrayList (java.util.ArrayList): 9
SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter): 9
Path (org.apache.hadoop.fs.Path): 8
InputStream (java.io.InputStream): 7
ByteBuffer (java.nio.ByteBuffer): 7
MockFlowFile (org.apache.nifi.util.MockFlowFile): 7
ByteArrayInputStream (java.io.ByteArrayInputStream): 6
Record (org.apache.avro.generic.GenericData.Record): 6
OutputStream (java.io.OutputStream): 5