Example 26 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.

the class OrcHadoopInputRowParserTest method testDate1900.

@Test
public void testDate1900() throws IOException {
    /*
      TestOrcFile.testDate1900.orc
      struct<time:timestamp,date:date>
      {1900-05-05 12:34:56.1, 1900-12-25}
     */
    HadoopDruidIndexerConfig config = loadHadoopDruidIndexerConfig("example/testDate1900_hadoop_job.json");
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);
    OrcStruct data = getFirstRow(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    Assert.assertEquals(1, rows.get(0).getDimensions().size());
    Assert.assertEquals("1900-12-25T00:00:00.000Z", rows.get(0).getDimension("date").get(0));
    Assert.assertEquals(DateTimes.of("1900-05-05T12:34:56.1Z"), rows.get(0).getTimestamp());
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) Configuration(org.apache.hadoop.conf.Configuration) InputRow(org.apache.druid.data.input.InputRow) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) Job(org.apache.hadoop.mapreduce.Job) Test(org.junit.Test)
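
The two helpers these Druid parser tests rely on, loadHadoopDruidIndexerConfig and getFirstRow, are not shown in this listing. Below is a minimal sketch of what a getFirstRow-style helper could look like, reading the first OrcStruct of a local ORC file with the orc-core Reader and the org.apache.orc.mapred record reader; it is an illustration only, and the actual helper in OrcHadoopInputRowParserTest may differ. The classes used (Job, Path, NullWritable, OrcFile, Reader, OrcMapredRecordReader, OrcStruct) all appear in the import lists elsewhere on this page.

// Hypothetical helper, not the actual test code: read the first OrcStruct from a local ORC file.
static OrcStruct getFirstRow(Job job, String orcPath) throws IOException {
    Reader reader = OrcFile.createReader(new Path(orcPath), OrcFile.readerOptions(job.getConfiguration()));
    OrcMapredRecordReader<OrcStruct> recordReader =
        new OrcMapredRecordReader<>(reader.rows(reader.options()), reader.getSchema());
    try {
        NullWritable key = recordReader.createKey();
        OrcStruct value = recordReader.createValue();
        if (!recordReader.next(key, value)) {
            throw new IOException("empty ORC file: " + orcPath);
        }
        return value;
    } finally {
        recordReader.close();
    }
}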

Example 27 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.

the class OrcHadoopInputRowParserTest method testTest2.

@Test
public void testTest2() throws IOException {
    HadoopDruidIndexerConfig config = loadHadoopDruidIndexerConfig("example/test_2_hadoop_job.json");
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);
    /*
      test_2.orc
      struct<timestamp:string,col1:string,col2:array<string>,col3:float,col4:bigint,col5:decimal,col6:array<string>,col7:map<string,string>>
      {2016-01-01, bar, [dat1, dat2, dat3], 1.1, 2, 3.5, [], {subcol7=subval7}}
     */
    OrcStruct data = getFirstRow(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    Assert.assertEquals(7, rows.get(0).getDimensions().size());
    Assert.assertEquals("bar", rows.get(0).getDimension("col1").get(0));
    Assert.assertEquals("dat1", rows.get(0).getDimension("col2").get(0));
    Assert.assertEquals("dat2", rows.get(0).getDimension("col2").get(1));
    Assert.assertEquals("dat3", rows.get(0).getDimension("col2").get(2));
    Assert.assertEquals(1.1f, rows.get(0).getRaw("col3"));
    Assert.assertEquals(2L, rows.get(0).getRaw("col4"));
    Assert.assertEquals(3.5d, rows.get(0).getRaw("col5"));
    Assert.assertEquals(ImmutableList.of(), rows.get(0).getRaw("col6"));
    Assert.assertEquals("subval7", rows.get(0).getRaw("col7-subcol7"));
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) Configuration(org.apache.hadoop.conf.Configuration) InputRow(org.apache.druid.data.input.InputRow) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) Job(org.apache.hadoop.mapreduce.Job) Test(org.junit.Test)
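
For reference, the row described in the comment above maps onto ORC's mapred writable types as follows. This is a purely illustrative construction (not code from the Druid tests); besides the OrcStruct, OrcList, OrcMap, TypeDescription, and Text classes listed on this page, it assumes the standard FloatWritable, LongWritable, and HiveDecimalWritable writables.

TypeDescription schema = TypeDescription.fromString(
    "struct<timestamp:string,col1:string,col2:array<string>,col3:float,col4:bigint,"
        + "col5:decimal,col6:array<string>,col7:map<string,string>>");
OrcStruct row = (OrcStruct) OrcStruct.createValue(schema);
row.setFieldValue("timestamp", new Text("2016-01-01"));
row.setFieldValue("col1", new Text("bar"));
// array<string> is represented as an OrcList of Text
OrcList<Text> col2 = new OrcList<>(schema.getChildren().get(2));
col2.add(new Text("dat1"));
col2.add(new Text("dat2"));
col2.add(new Text("dat3"));
row.setFieldValue("col2", col2);
row.setFieldValue("col3", new FloatWritable(1.1f));
row.setFieldValue("col4", new LongWritable(2L));
row.setFieldValue("col5", new HiveDecimalWritable("3.5"));
// empty array<string> for col6
row.setFieldValue("col6", new OrcList<Text>(schema.getChildren().get(6)));
// map<string,string> is represented as an OrcMap; the test above reads it back via the flat key "col7-subcol7"
OrcMap<Text, Text> col7 = new OrcMap<>(schema.getChildren().get(7));
col7.put(new Text("subcol7"), new Text("subval7"));
row.setFieldValue("col7", col7);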

Example 28 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.

the class OrcReader method intermediateRowIterator.

@Override
protected CloseableIterator<OrcStruct> intermediateRowIterator() throws IOException {
    final Closer closer = Closer.create();
    // We fetch here to cache a copy locally. However, this might need to be changed if we want to split an orc file
    // into several InputSplits in the future.
    final byte[] buffer = new byte[InputEntity.DEFAULT_FETCH_BUFFER_SIZE];
    final CleanableFile file = closer.register(source.fetch(temporaryDirectory, buffer));
    final Path path = new Path(file.file().toURI());
    final ClassLoader currentClassLoader = Thread.currentThread().getContextClassLoader();
    final Reader reader;
    try {
        Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
        reader = closer.register(OrcFile.createReader(path, OrcFile.readerOptions(conf)));
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassLoader);
    }
    // The line below gets the schema needed to read all columns.
    // This can be improved in the future by projecting only the columns that users actually want.
    final TypeDescription schema = reader.getSchema();
    final RecordReader batchReader = reader.rows(reader.options());
    final OrcMapredRecordReader<OrcStruct> recordReader = new OrcMapredRecordReader<>(batchReader, schema);
    closer.register(recordReader::close);
    return new CloseableIterator<OrcStruct>() {

        final NullWritable key = recordReader.createKey();

        OrcStruct value = null;

        @Override
        public boolean hasNext() {
            if (value == null) {
                try {
                    // The OrcStruct returned from next() may be kept in memory for a while by callers.
                    // Here, we create a new OrcStruct instance before calling RecordReader.next(),
                    // so that we avoid sharing the same "value" reference across rows.
                    value = recordReader.createValue();
                    if (!recordReader.next(key, value)) {
                        value = null;
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            return value != null;
        }

        @Override
        public OrcStruct next() {
            if (value == null) {
                throw new NoSuchElementException();
            }
            final OrcStruct currentValue = value;
            value = null;
            return currentValue;
        }

        @Override
        public void close() throws IOException {
            closer.close();
        }
    };
}
Also used : Closer(org.apache.druid.java.util.common.io.Closer) Path(org.apache.hadoop.fs.Path) CloseableIterator(org.apache.druid.java.util.common.parsers.CloseableIterator) RecordReader(org.apache.orc.RecordReader) OrcMapredRecordReader(org.apache.orc.mapred.OrcMapredRecordReader) Reader(org.apache.orc.Reader) RecordReader(org.apache.orc.RecordReader) IntermediateRowParsingReader(org.apache.druid.data.input.IntermediateRowParsingReader) OrcMapredRecordReader(org.apache.orc.mapred.OrcMapredRecordReader) IOException(java.io.IOException) NullWritable(org.apache.hadoop.io.NullWritable) OrcStruct(org.apache.orc.mapred.OrcStruct) TypeDescription(org.apache.orc.TypeDescription) OrcMapredRecordReader(org.apache.orc.mapred.OrcMapredRecordReader) CleanableFile(org.apache.druid.data.input.InputEntity.CleanableFile) NoSuchElementException(java.util.NoSuchElementException)
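
Because every resource opened above (the fetched file, the ORC Reader, and the record reader) is registered on the Closer, closing the returned iterator releases all of them in one call. A minimal consumption sketch from inside OrcReader or a subclass, illustrative rather than actual Druid code, assuming the enclosing method declares throws IOException:

// CloseableIterator extends both Iterator and Closeable, so try-with-resources closes the Closer for us.
try (CloseableIterator<OrcStruct> rows = intermediateRowIterator()) {
    while (rows.hasNext()) {
        OrcStruct row = rows.next();
        // hand the row to the converter / row-parsing logic here
    }
}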

Example 29 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.

the class OrcStructConverterTest method testConvertRootFieldWithUnknownFieldNameReturningNull.

@Test
public void testConvertRootFieldWithUnknownFieldNameReturningNull() {
    final Map<String, TypeDescription> types = new HashMap<>();
    types.put("int", TypeDescription.createInt());
    final TypeDescription schema = createRootSchema(types);
    final OrcStruct orcStruct = new OrcStruct(schema);
    orcStruct.setFieldValue("int", new IntWritable(1024));
    final OrcStructConverter converter = new OrcStructConverter(false);
    assertNullValue(converter, orcStruct, "unknownField");
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) HashMap(java.util.HashMap) TypeDescription(org.apache.orc.TypeDescription) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)
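
The createRootSchema and assertNullValue helpers are not shown in this listing. Hypothetical reconstructions follow, assuming createRootSchema builds a struct schema from the name-to-type map via TypeDescription.addField, and assuming the converter exposes a convertRootField(OrcStruct, String) entry point, as the test method name suggests; the real helpers in OrcStructConverterTest may differ.

private static TypeDescription createRootSchema(Map<String, TypeDescription> fields) {
    // addField(String name, TypeDescription type) appends a named field to the struct schema.
    TypeDescription root = TypeDescription.createStruct();
    fields.forEach(root::addField);
    return root;
}

private static void assertNullValue(OrcStructConverter converter, OrcStruct struct, String fieldName) {
    // Assumption: convertRootField(OrcStruct, String) is the converter's per-field conversion method.
    Assert.assertNull(converter.convertRootField(struct, fieldName));
}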

Example 30 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcCompactionTaskTest method basicTestWithRecompactionAndBasicSchemaEvolution.

@Test
public void basicTestWithRecompactionAndBasicSchemaEvolution() throws Exception {
    File basePath = Files.createTempDir();
    basePath.deleteOnExit();
    String minutelyPath = "Identity/MemberAccount/minutely/2017/04/03/10/20_30/run_2017-04-03-10-20";
    String hourlyPath = "Identity/MemberAccount/hourly/2017/04/03/10/";
    File jobDir = new File(basePath, minutelyPath);
    Assert.assertTrue(jobDir.mkdirs());
    // Writing some basic ORC files
    createTestingData(jobDir);
    // Writing an additional file with an ** evolved schema **.
    TypeDescription evolvedSchema = TypeDescription.fromString("struct<i:int,j:int,k:int>");
    OrcStruct orcStruct_4 = (OrcStruct) OrcStruct.createValue(evolvedSchema);
    orcStruct_4.setFieldValue("i", new IntWritable(5));
    orcStruct_4.setFieldValue("j", new IntWritable(6));
    orcStruct_4.setFieldValue("k", new IntWritable(7));
    File file_2 = new File(jobDir, "file_2.1." + extensionName);
    writeOrcRecordsInFile(new Path(file_2.getAbsolutePath()), evolvedSchema, ImmutableList.of(orcStruct_4));
    // Make this the newest file.
    file_2.setLastModified(Long.MAX_VALUE);
    // Verify execution
    // Overwrite the job configurator factory key.
    EmbeddedGobblin embeddedGobblin = createEmbeddedGobblinCompactionJob("basic", basePath.getAbsolutePath())
        .setConfiguration(CompactionJobConfigurator.COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY,
            TestCompactionOrcJobConfigurator.Factory.class.getName())
        .setConfiguration(COMPACTION_OUTPUT_EXTENSION, extensionName)
        .setConfiguration(COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET, "Identity.*:0.1");
    JobExecutionResult execution = embeddedGobblin.run();
    Assert.assertTrue(execution.isSuccessful());
    // Result verification
    File outputDir = new File(basePath, hourlyPath);
    FileSystem fs = FileSystem.getLocal(new Configuration());
    List<FileStatus> statuses = new ArrayList<>();
    reloadFolder(statuses, outputDir, fs);
    Assert.assertTrue(statuses.size() == 1);
    List<OrcStruct> result = readOrcFile(statuses.get(0).getPath());
    Assert.assertEquals(result.size(), 4);
    Assert.assertEquals(result.get(0).getFieldValue("i"), new IntWritable(1));
    Assert.assertEquals(result.get(0).getFieldValue("j"), new IntWritable(2));
    Assert.assertNull(result.get(0).getFieldValue("k"));
    Assert.assertEquals(result.get(1).getFieldValue("i"), new IntWritable(2));
    Assert.assertEquals(result.get(1).getFieldValue("j"), new IntWritable(3));
    Assert.assertNull(result.get(1).getFieldValue("k"));
    Assert.assertEquals(result.get(2).getFieldValue("i"), new IntWritable(4));
    Assert.assertEquals(result.get(2).getFieldValue("j"), new IntWritable(5));
    Assert.assertNull(result.get(2).getFieldValue("k"));
    Assert.assertEquals(result.get(3).getFieldValue("i"), new IntWritable(5));
    Assert.assertEquals(result.get(3).getFieldValue("j"), new IntWritable(6));
    Assert.assertEquals(result.get(3).getFieldValue("k"), new IntWritable(7));
    // Adding new .orc file into the directory and verify if re-compaction is triggered.
    File file_late = new File(jobDir, "file_late.1." + extensionName);
    OrcStruct orcStruct_5 = (OrcStruct) OrcStruct.createValue(evolvedSchema);
    orcStruct_5.setFieldValue("i", new IntWritable(10));
    orcStruct_5.setFieldValue("j", new IntWritable(11));
    orcStruct_5.setFieldValue("k", new IntWritable(12));
    writeOrcRecordsInFile(new Path(file_late.getAbsolutePath()), evolvedSchema, ImmutableList.of(orcStruct_5));
    execution = embeddedGobblin.run();
    Assert.assertTrue(execution.isSuccessful());
    reloadFolder(statuses, outputDir, fs);
    result = readOrcFile(statuses.get(0).getPath());
    // Note: the previous execution produced 4 results; with re-compaction, this run should yield 1 more late record.
    Assert.assertEquals(result.size(), 4 + 1);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) JobExecutionResult(org.apache.gobblin.runtime.api.JobExecutionResult) OrcStruct(org.apache.orc.mapred.OrcStruct) FileSystem(org.apache.hadoop.fs.FileSystem) TypeDescription(org.apache.orc.TypeDescription) EmbeddedGobblin(org.apache.gobblin.runtime.embedded.EmbeddedGobblin) OrcFile(org.apache.orc.OrcFile) File(java.io.File) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)
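
The writeOrcRecordsInFile and readOrcFile helpers come from the Gobblin compaction test base and are not shown here. Below is a hedged sketch of what a writeOrcRecordsInFile-style helper could look like, pushing OrcStruct records through OrcMapredRecordWriter on top of an orc-core Writer; the actual Gobblin helper may be implemented differently.

// Hypothetical helper: write the given OrcStruct records to an ORC file with the supplied schema.
static void writeOrcRecordsInFile(Path path, TypeDescription schema, List<OrcStruct> records) throws IOException {
    Configuration conf = new Configuration();
    Writer fileWriter = OrcFile.createWriter(path, OrcFile.writerOptions(conf).setSchema(schema));
    OrcMapredRecordWriter<OrcStruct> recordWriter = new OrcMapredRecordWriter<>(fileWriter);
    try {
        for (OrcStruct record : records) {
            recordWriter.write(NullWritable.get(), record);
        }
    } finally {
        // Closing the record writer also closes the underlying ORC file writer.
        recordWriter.close(Reporter.NULL);
    }
}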

Aggregations

OrcStruct (org.apache.orc.mapred.OrcStruct) 39
TypeDescription (org.apache.orc.TypeDescription) 24
Configuration (org.apache.hadoop.conf.Configuration) 18
IntWritable (org.apache.hadoop.io.IntWritable) 17
Test (org.testng.annotations.Test) 15
ArrayList (java.util.ArrayList) 13
Test (org.junit.Test) 9
OrcFile (org.apache.orc.OrcFile) 8
OrcList (org.apache.orc.mapred.OrcList) 8
File (java.io.File) 7
InputRow (org.apache.druid.data.input.InputRow) 7
Path (org.apache.hadoop.fs.Path) 7
Job (org.apache.hadoop.mapreduce.Job) 7
OrcUnion (org.apache.orc.mapred.OrcUnion) 7
ImmutableList (com.google.common.collect.ImmutableList) 6
List (java.util.List) 6
HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig) 6
Text (org.apache.hadoop.io.Text) 6
OrcKey (org.apache.orc.mapred.OrcKey) 5
OrcMap (org.apache.orc.mapred.OrcMap) 5