Search in sources :

Example 6 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project crunch by cloudera.

the class AvroFileSourceTargetTest method populateGenericFile.

/**
 * Writes the given records to {@code this.avroFile} as an Avro container file.
 *
 * @param genericRecords records to append, in iteration order
 * @param schema schema passed to both the datum writer and the container file header
 * @throws IOException if the file cannot be opened, written, or closed
 */
private void populateGenericFile(List<GenericRecord> genericRecords, Schema schema) throws IOException {
    FileOutputStream outputStream = new FileOutputStream(this.avroFile);
    try {
        GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<GenericRecord>(schema);
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(genericDatumWriter);
        // create() is kept outside the inner try so close() is only ever called on an opened writer.
        dataFileWriter.create(schema, outputStream);
        try {
            for (GenericRecord record : genericRecords) {
                dataFileWriter.append(record);
            }
        } finally {
            // Close the writer even when an append fails so it does not stay open.
            dataFileWriter.close();
        }
    } finally {
        // Always release the file handle, including when create() or append() throws.
        outputStream.close();
    }
}
Also used : FileOutputStream(java.io.FileOutputStream) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 7 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project druid by druid-io.

the class AvroHadoopInputRowParserTest method buildPigAvro.

/**
 * Round-trips an Avro record through a local Pig job and returns what Pig wrote back.
 *
 * <p>The record is written to a temporary Avro file, loaded into Pig with
 * {@code inputStorage}, stored again with {@code outputStorage}, and the first record of the
 * resulting {@code part-m-00000.avro} file is returned. The temporary directory is deleted
 * before returning.
 *
 * @param datum record to write; it is written with the schema from {@code SomeAvroDatum.getClassSchema()}
 * @param inputStorage text placed after {@code USING} in the Pig {@code LOAD} statement
 * @param outputStorage storage function spec passed to {@code PigServer.store}
 * @return the first record read from the Pig output file
 * @throws IOException if writing, running Pig, reading, or cleanup fails
 */
private static GenericRecord buildPigAvro(GenericRecord datum, String inputStorage, String outputStorage) throws IOException {
    final File tmpDir = Files.createTempDir();
    FileReader<GenericRecord> reader = null;
    PigServer pigServer = null;
    try {
        // 0. write avro object into temp file.
        File someAvroDatumFile = new File(tmpDir, "someAvroDatum.avro");
        DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>());
        dataFileWriter.create(SomeAvroDatum.getClassSchema(), someAvroDatumFile);
        try {
            dataFileWriter.append(datum);
        } finally {
            // Close even if append fails; otherwise the open handle on the temp file
            // outlives the directory deletion in the outer finally.
            dataFileWriter.close();
        }
        // 1. read avro files into Pig
        pigServer = new PigServer(ExecType.LOCAL);
        pigServer.registerQuery(String.format("A = LOAD '%s' USING %s;", someAvroDatumFile, inputStorage));
        // 2. write new avro file using AvroStorage
        File outputDir = new File(tmpDir, "output");
        pigServer.store("A", String.valueOf(outputDir), outputStorage);
        // 3. read avro object from AvroStorage
        reader = DataFileReader.openReader(new File(outputDir, "part-m-00000.avro"), new GenericDatumReader<GenericRecord>());
        return reader.next();
    } finally {
        if (pigServer != null) {
            pigServer.shutdown();
        }
        Closeables.close(reader, true);
        FileUtils.deleteDirectory(tmpDir);
    }
}
Also used : PigServer(org.apache.pig.PigServer) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Example 8 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project haivvreo by jghoman.

the class AvroContainerOutputFormat method getHiveRecordWriter.

/**
 * Opens an Avro container file at {@code path} and wraps it in an
 * {@code AvroGenericRecordWriter}.
 *
 * <p>The schema comes from {@code HaivvreoUtils.determineSchemaOrThrowException}. When
 * {@code isCompressed} is set, the codec named by {@code OUTPUT_CODEC} (defaulting to
 * {@code DEFLATE_CODEC}) is applied; the deflate codec uses the level configured under
 * {@code DEFLATE_LEVEL_KEY}.
 *
 * @throws IOException if the schema cannot be determined or the output file cannot be created
 */
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jobConf, Path path, Class<? extends Writable> valueClass, boolean isCompressed, Properties properties, Progressable progressable) throws IOException {
    Schema avroSchema;
    try {
        avroSchema = HaivvreoUtils.determineSchemaOrThrowException(jobConf, properties);
    } catch (HaivvreoException cause) {
        // Surface schema problems through the IOException declared by the interface.
        throw new IOException(cause);
    }

    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(avroSchema));

    if (isCompressed) {
        int deflateLevel = jobConf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String requestedCodec = jobConf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory codec;
        if (requestedCodec.equals(DEFLATE_CODEC)) {
            // Only the deflate codec takes the configured compression level.
            codec = CodecFactory.deflateCodec(deflateLevel);
        } else {
            codec = CodecFactory.fromString(requestedCodec);
        }
        writer.setCodec(codec);
    }

    writer.create(avroSchema, path.getFileSystem(jobConf).create(path));
    return new AvroGenericRecordWriter(writer);
}
Also used : Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) IOException(java.io.IOException) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) CodecFactory(org.apache.avro.file.CodecFactory) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 9 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project pinot by linkedin.

the class UploadRefreshDeleteIntegrationTest method generateAndUploadRandomSegment1.

/**
 * Generates an Avro file of random integers, builds a segment from it, and uploads that
 * segment while a second build of the same segment runs on a background thread.
 *
 * <p>Steps: write {@code rowCount} rows to {@code <segmentName>.avro}, build the segment,
 * record the name and length of the resulting {@code .tar.gz}, delete the tar directory,
 * start a background rebuild into the same directory, then upload the recorded file.
 * NOTE(review): the upload appears to intentionally race the background rebuild — confirm.
 *
 * @param segmentName segment name; the text after the first {@code _} must parse as an int
 * @param rowCount number of rows to write to the Avro file
 * @throws Exception if generation, the initial build, or the upload fails
 */
protected void generateAndUploadRandomSegment1(final String segmentName, int rowCount) throws Exception {
    ThreadLocalRandom random = ThreadLocalRandom.current();
    Schema schema = new Schema.Parser().parse(new File(TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource("dummy.avsc"))));
    GenericRecord record = new GenericData.Record(schema);
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
    final File avroFile = new File(_tmpDir, segmentName + ".avro");
    // try-with-resources closes the writer even if create() or append() throws.
    try (DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<GenericRecord>(datumWriter)) {
        fileWriter.create(schema, avroFile);
        for (int i = 0; i < rowCount; i++) {
            // The same record instance is reused; only field 0 changes between rows.
            record.put(0, random.nextInt());
            fileWriter.append(record);
        }
    }
    final int segmentIndex = Integer.parseInt(segmentName.split("_")[1]);
    final String TAR_GZ_FILE_EXTENSION = ".tar.gz";
    File segmentTarDir = new File(_tarsDir, segmentName);
    buildSegment(segmentTarDir, avroFile, segmentIndex, segmentName, 0);
    // Falls back to the bare segment name when no tarball is found in the directory.
    String segmentFileName = segmentName;
    for (String name : segmentTarDir.list()) {
        if (name.endsWith(TAR_GZ_FILE_EXTENSION)) {
            segmentFileName = name;
        }
    }
    File file = new File(segmentTarDir, segmentFileName);
    // Capture the length before the directory is removed below.
    long segmentLength = file.length();
    final File segmentTarDir1 = new File(_tarsDir, segmentName);
    FileUtils.deleteQuietly(segmentTarDir);
    new Thread(new Runnable() {

        @Override
        public void run() {
            try {
                buildSegment(segmentTarDir1, avroFile, segmentIndex, segmentName, 5);
            } catch (Exception ignored) {
                // Best-effort background rebuild: a failure here is not reported to the caller.
                // NOTE(review): confirm that silently dropping this failure is intended.
            }
        }
    }).start();
    FileUploadUtils.sendSegmentFile("localhost", "8998", segmentFileName, file, segmentLength, 5, 5);
    avroFile.delete();
    FileUtils.deleteQuietly(segmentTarDir);
}
Also used : Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) GenericRecord(org.apache.avro.generic.GenericRecord) GenericData(org.apache.avro.generic.GenericData) File(java.io.File)

Example 10 with DataFileWriter

use of org.apache.avro.file.DataFileWriter in project pinot by linkedin.

the class UploadRefreshDeleteIntegrationTest method generateAndUploadRandomSegment.

/**
 * Generates an Avro file of random integers, builds segment tarballs from it, and uploads
 * every file found in the segment's tar directory.
 *
 * <p>The Avro file and the tar directory are deleted once all uploads have completed.
 *
 * @param segmentName segment name; the text after the first {@code _} must parse as an int
 * @param rowCount number of rows to write to the Avro file
 * @throws Exception if generation, segment building, or any upload fails
 */
protected void generateAndUploadRandomSegment(String segmentName, int rowCount) throws Exception {
    ThreadLocalRandom random = ThreadLocalRandom.current();
    Schema schema = new Schema.Parser().parse(new File(TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource("dummy.avsc"))));
    GenericRecord record = new GenericData.Record(schema);
    GenericDatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
    File avroFile = new File(_tmpDir, segmentName + ".avro");
    // try-with-resources closes the writer even if create() or append() throws.
    try (DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<GenericRecord>(datumWriter)) {
        fileWriter.create(schema, avroFile);
        for (int i = 0; i < rowCount; i++) {
            // The same record instance is reused; only field 0 changes between rows.
            record.put(0, random.nextInt());
            fileWriter.append(record);
        }
    }
    int segmentIndex = Integer.parseInt(segmentName.split("_")[1]);
    File segmentTarDir = new File(_tarsDir, segmentName);
    ensureDirectoryExistsAndIsEmpty(segmentTarDir);
    ExecutorService executor = MoreExecutors.sameThreadExecutor();
    buildSegmentsFromAvro(Collections.singletonList(avroFile), executor, segmentIndex, new File(_segmentsDir, segmentName), segmentTarDir, this.tableName, false, null);
    executor.shutdown();
    executor.awaitTermination(1L, TimeUnit.MINUTES);
    for (String segmentFileName : segmentTarDir.list()) {
        File file = new File(segmentTarDir, segmentFileName);
        FileUploadUtils.sendSegmentFile("localhost", "8998", segmentFileName, file, file.length());
    }
    avroFile.delete();
    FileUtils.deleteQuietly(segmentTarDir);
}
Also used : Schema(org.apache.avro.Schema) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) ExecutorService(java.util.concurrent.ExecutorService) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) GenericRecord(org.apache.avro.generic.GenericRecord) GenericData(org.apache.avro.generic.GenericData) File(java.io.File)

Aggregations

DataFileWriter (org.apache.avro.file.DataFileWriter): 34, Schema (org.apache.avro.Schema): 21, GenericRecord (org.apache.avro.generic.GenericRecord): 21, GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter): 17, File (java.io.File): 14, FileOutputStream (java.io.FileOutputStream): 7, SpecificDatumWriter (org.apache.avro.specific.SpecificDatumWriter): 7, ByteArrayOutputStream (java.io.ByteArrayOutputStream): 5, IOException (java.io.IOException): 4, GenericData (org.apache.avro.generic.GenericData): 4, GenericDatumReader (org.apache.avro.generic.GenericDatumReader): 4, ArrayList (java.util.ArrayList): 3, HashMap (java.util.HashMap): 3, DataFileStream (org.apache.avro.file.DataFileStream): 3, Person (org.apache.crunch.test.Person): 3, Test (org.junit.Test): 3, ByteArrayInputStream (java.io.ByteArrayInputStream): 2, Random (java.util.Random): 2, ThreadLocalRandom (java.util.concurrent.ThreadLocalRandom): 2, CodecFactory (org.apache.avro.file.CodecFactory): 2