Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
The class FetchOperator, method getInputFormatFromCache.
@SuppressWarnings("unchecked")
static InputFormat getInputFormatFromCache(Class<? extends InputFormat> inputFormatClass, JobConf conf) throws IOException {
if (Configurable.class.isAssignableFrom(inputFormatClass) || JobConfigurable.class.isAssignableFrom(inputFormatClass)) {
return ReflectionUtil.newInstance(inputFormatClass, conf);
}
// TODO: why is this copy-pasted from HiveInputFormat?
InputFormat format = inputFormats.get(inputFormatClass.getName());
if (format == null) {
try {
format = ReflectionUtil.newInstance(inputFormatClass, conf);
inputFormats.put(inputFormatClass.getName(), format);
} catch (Exception e) {
throw new IOException("Cannot create an instance of InputFormat class " + inputFormatClass.getName() + " as specified in mapredWork!", e);
}
}
return format;
}
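For orientation, a caller in the same package could use this helper roughly as follows. This is a minimal sketch, not Hive's actual FetchOperator read path; the method name readFirstSplit and the raw-typed reader are illustrative assumptions.
// Hypothetical caller sketch: obtain a (possibly cached) InputFormat and read one split.
@SuppressWarnings({ "rawtypes", "unchecked" })
static void readFirstSplit(Class<? extends InputFormat> ifClass, JobConf job) throws IOException {
  // Configurable/JobConfigurable formats are instantiated fresh; others come from the static cache.
  InputFormat format = getInputFormatFromCache(ifClass, job);
  InputSplit[] splits = format.getSplits(job, 1);
  org.apache.hadoop.mapred.RecordReader reader = format.getRecordReader(splits[0], job, Reporter.NULL);
  Object key = reader.createKey();
  Object value = reader.createValue();
  while (reader.next(key, value)) {
    // process (key, value) here
  }
  reader.close();
}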
Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
The class TestOrcRawRecordMerger, method testRecordReaderOldBaseAndDelta.
/**
* Test the OrcRecordUpdater with the OrcRawRecordMerger when there is
* a base and a delta.
* @throws Exception
*/
@Test
public void testRecordReaderOldBaseAndDelta() throws Exception {
final int BUCKET = 10;
Configuration conf = new Configuration();
OrcOutputFormat of = new OrcOutputFormat();
FileSystem fs = FileSystem.getLocal(conf);
Path root = new Path(tmpDir, "testOldBaseAndDelta").makeQualified(fs);
fs.delete(root, true);
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
// write the base
MemoryManager mgr = new MemoryManager(conf) {
int rowsAddedSinceCheck = 0;
@Override
public synchronized void addedRow(int rows) throws IOException {
rowsAddedSinceCheck += rows;
if (rowsAddedSinceCheck >= 2) {
notifyWriters();
rowsAddedSinceCheck = 0;
}
}
};
// make 5 stripes with 2 rows each
Writer writer = OrcFile.createWriter(new Path(root, "0000010_0"), OrcFile.writerOptions(conf).inspector(inspector).fileSystem(fs).blockPadding(false).bufferSize(10000).compress(CompressionKind.NONE).stripeSize(1).memory(mgr).batchSize(2).version(OrcFile.Version.V_0_11));
String[] values = new String[] { "ignore.1", "0.1", "ignore.2", "ignore.3", "2.0", "2.1", "3.0", "ignore.4", "ignore.5", "ignore.6" };
for (int i = 0; i < values.length; ++i) {
writer.addRow(new BigRow(i, i, values[i], i, i));
}
writer.close();
// write a delta
AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).writingBase(false).minimumTransactionId(1).maximumTransactionId(1).bucket(BUCKET).inspector(inspector).filesystem(fs).recordIdColumn(5).finalDestination(root);
RecordUpdater ru = of.getRecordUpdater(root, options);
values = new String[] { "0.0", null, null, "1.1", null, null, null, "ignore.7" };
for (int i = 0; i < values.length; ++i) {
if (values[i] != null) {
ru.update(1, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
}
}
ru.delete(100, new BigRow(9, 0, BUCKET));
ru.close(false);
// write a second delta
options = options.minimumTransactionId(2).maximumTransactionId(2);
ru = of.getRecordUpdater(root, options);
values = new String[] { null, null, "1.0", null, null, null, null, "3.1" };
for (int i = 0; i < values.length; ++i) {
if (values[i] != null) {
ru.update(2, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
}
}
ru.delete(100, new BigRow(8, 0, BUCKET));
ru.close(false);
InputFormat inf = new OrcInputFormat();
JobConf job = new JobConf();
job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
job.set("mapred.min.split.size", "1");
job.set("mapred.max.split.size", "2");
job.set("mapred.input.dir", root.toString());
InputSplit[] splits = inf.getSplits(job, 5);
assertEquals(5, splits.length);
org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
// read the first four splits; the fifth is checked separately below and should be empty
for (int i = 0; i < 4; ++i) {
System.out.println("starting split " + i);
rr = inf.getRecordReader(splits[i], job, Reporter.NULL);
NullWritable key = rr.createKey();
OrcStruct value = rr.createValue();
// there should be exactly two rows per split
for (int j = 0; j < 2; ++j) {
System.out.println("i = " + i + ", j = " + j);
assertEquals(true, rr.next(key, value));
System.out.println("record = " + value);
assertEquals(i + "." + j, value.getFieldValue(2).toString());
}
assertEquals(false, rr.next(key, value));
}
rr = inf.getRecordReader(splits[4], job, Reporter.NULL);
assertEquals(false, rr.next(rr.createKey(), rr.createValue()));
}
Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
The class HiveSplitGenerator, method initialize.
@SuppressWarnings("unchecked")
@Override
public List<Event> initialize() throws Exception {
// Set up the map work for this thread. Pruning may have modified the work instance to remove
// partitions. The same work instance must be used when generating splits.
Utilities.setMapWork(jobConf, work);
try {
boolean sendSerializedEvents = conf.getBoolean("mapreduce.tez.input.initializer.serialize.event.payload", true);
// perform dynamic partition pruning
if (pruner != null) {
pruner.prune();
}
InputSplitInfoMem inputSplitInfo = null;
boolean generateConsistentSplits = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
String realInputFormatName = conf.get("mapred.input.format.class");
boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
if (groupingEnabled) {
// Need to instantiate the realInputFormat
InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
int totalResource = 0;
int taskResource = 0;
int availableSlots = 0;
// FIXME. Do the right thing Luke.
if (getContext() == null) {
// for now, totalResource = taskResource for llap
availableSlots = 1;
}
if (getContext() != null) {
totalResource = getContext().getTotalAvailableResource().getMemory();
taskResource = getContext().getVertexTaskResource().getMemory();
availableSlots = totalResource / taskResource;
}
if (HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1) <= 1) {
// broken configuration from mapred-default.xml
final long blockSize = conf.getLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT);
final long minGrouping = conf.getLong(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE_DEFAULT);
final long preferredSplitSize = Math.min(blockSize / 2, minGrouping);
HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, preferredSplitSize);
LOG.info("The preferred split size is " + preferredSplitSize);
}
// Create the un-grouped splits
float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
// Raw splits
InputSplit[] splits = inputFormat.getSplits(jobConf, (int) (availableSlots * waves));
// Sort the splits, so that subsequent grouping is consistent.
Arrays.sort(splits, new InputSplitComparator());
LOG.info("Number of input splits: " + splits.length + ". " + availableSlots + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);
if (work.getIncludedBuckets() != null) {
splits = pruneBuckets(work, splits);
}
Multimap<Integer, InputSplit> groupedSplits = splitGrouper.generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, splitLocationProvider);
// And finally return them in a flat array
InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
LOG.info("Number of split groups: " + flatSplits.length);
List<TaskLocationHint> locationHints = splitGrouper.createTaskLocationHints(flatSplits, generateConsistentSplits);
inputSplitInfo = new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
} else {
// If this is used in the future - make sure to disable grouping in the payload, if it isn't already disabled
throw new RuntimeException("HiveInputFormat does not support non-grouped splits, InputFormatName is: " + realInputFormatName);
// inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
}
return createEventList(sendSerializedEvents, inputSplitInfo);
} finally {
Utilities.clearWork(jobConf);
}
}
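The InputSplitComparator used for the sort above is defined elsewhere in HiveSplitGenerator and is not shown in this snippet; a comparator that makes the ordering deterministic could look roughly like the sketch below, assuming FileSplit-backed splits (the real implementation may handle more cases).
// Sketch of a deterministic split ordering: by path, then start offset, then length.
static class InputSplitComparatorSketch implements java.util.Comparator<InputSplit> {
  @Override
  public int compare(InputSplit a, InputSplit b) {
    if (a instanceof FileSplit && b instanceof FileSplit) {
      FileSplit fa = (FileSplit) a;
      FileSplit fb = (FileSplit) b;
      int c = fa.getPath().compareTo(fb.getPath());
      if (c != 0) {
        return c;
      }
      c = Long.compare(fa.getStart(), fb.getStart());
      if (c != 0) {
        return c;
      }
      return Long.compare(fa.getLength(), fb.getLength());
    }
    // Non-file splits: no stable key available in this sketch, so keep their relative order.
    return 0;
  }
}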
Use of org.apache.hadoop.mapred.InputFormat in project hive by apache.
The class SemanticAnalyzer, method getMetaData.
@SuppressWarnings("nls")
private void getMetaData(QB qb, ReadEntity parentInput) throws HiveException {
LOG.info("Get metadata for source tables");
// Go over the tables and populate the related structures.
// We have to materialize the table alias list since we might
// modify it in the middle for view rewrite.
List<String> tabAliases = new ArrayList<String>(qb.getTabAliases());
// Keep track of view alias to view name and read entity
// E.g., for a query like 'select * from V3', where V3 -> V2, V2 -> V1, V1 -> T, this
// keeps track of the full view name and read entity corresponding to aliases V3, V3:V2, V3:V2:V1.
// This is needed for tracking the dependencies for inputs, along with their parents.
Map<String, ObjectPair<String, ReadEntity>> aliasToViewInfo = new HashMap<String, ObjectPair<String, ReadEntity>>();
/*
* used to capture view to SQ conversions. This is used to check for
* recursive CTE invocations.
*/
Map<String, String> sqAliasToCTEName = new HashMap<String, String>();
for (String alias : tabAliases) {
String tabName = qb.getTabNameForAlias(alias);
String cteName = tabName.toLowerCase();
Table tab = db.getTable(tabName, false);
if (tab == null || tab.getDbName().equals(SessionState.get().getCurrentDatabase())) {
Table materializedTab = ctx.getMaterializedTable(cteName);
if (materializedTab == null) {
// we first look for this alias from CTE, and then from catalog.
CTEClause cte = findCTEFromName(qb, cteName);
if (cte != null) {
if (!cte.materialize) {
addCTEAsSubQuery(qb, cteName, alias);
sqAliasToCTEName.put(alias, cteName);
continue;
}
tab = materializeCTE(cteName, cte);
}
} else {
tab = materializedTab;
}
}
if (tab == null) {
ASTNode src = qb.getParseInfo().getSrcForAlias(alias);
if (null != src) {
throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(src));
} else {
throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(alias));
}
}
if (tab.isView()) {
if (qb.getParseInfo().isAnalyzeCommand()) {
throw new SemanticException(ErrorMsg.ANALYZE_VIEW.getMsg());
}
String fullViewName = tab.getDbName() + "." + tab.getTableName();
// Prevent view cycles
if (viewsExpanded.contains(fullViewName)) {
throw new SemanticException("Recursive view " + fullViewName + " detected (cycle: " + StringUtils.join(viewsExpanded, " -> ") + " -> " + fullViewName + ").");
}
replaceViewReferenceWithDefinition(qb, tab, tabName, alias);
// If the view is inside another view, it should have at least one parent
if (qb.isInsideView() && parentInput == null) {
parentInput = PlanUtils.getParentViewInfo(getAliasId(alias, qb), viewAliasToInput);
}
ReadEntity viewInput = new ReadEntity(tab, parentInput, !qb.isInsideView());
viewInput = PlanUtils.addInput(inputs, viewInput);
aliasToViewInfo.put(alias, new ObjectPair<String, ReadEntity>(fullViewName, viewInput));
String aliasId = getAliasId(alias, qb);
if (aliasId != null) {
aliasId = aliasId.replace(SemanticAnalyzer.SUBQUERY_TAG_1, "").replace(SemanticAnalyzer.SUBQUERY_TAG_2, "");
}
viewAliasToInput.put(aliasId, viewInput);
continue;
}
if (!InputFormat.class.isAssignableFrom(tab.getInputFormatClass())) {
throw new SemanticException(generateErrorMessage(qb.getParseInfo().getSrcForAlias(alias), ErrorMsg.INVALID_INPUT_FORMAT_TYPE.getMsg()));
}
qb.getMetaData().setSrcForAlias(alias, tab);
if (qb.getParseInfo().isAnalyzeCommand()) {
// allow partial partition specification for noscan since noscan is fast.
TableSpec ts = new TableSpec(db, conf, (ASTNode) ast.getChild(0), true, this.noscan);
if (ts.specType == SpecType.DYNAMIC_PARTITION) {
// dynamic partitions
try {
ts.partitions = db.getPartitionsByNames(ts.tableHandle, ts.partSpec);
} catch (HiveException e) {
throw new SemanticException(generateErrorMessage(qb.getParseInfo().getSrcForAlias(alias), "Cannot get partitions for " + ts.partSpec), e);
}
}
// validate partial scan command
QBParseInfo qbpi = qb.getParseInfo();
if (qbpi.isPartialScanAnalyzeCommand()) {
Class<? extends InputFormat> inputFormatClass = null;
switch(ts.specType) {
case TABLE_ONLY:
case DYNAMIC_PARTITION:
inputFormatClass = ts.tableHandle.getInputFormatClass();
break;
case STATIC_PARTITION:
inputFormatClass = ts.partHandle.getInputFormatClass();
break;
default:
assert false;
}
// throw a SemanticException for formats other than RCFile or ORC.
if (!(inputFormatClass.equals(RCFileInputFormat.class) || inputFormatClass.equals(OrcInputFormat.class))) {
throw new SemanticException(ErrorMsg.ANALYZE_TABLE_PARTIALSCAN_NON_RCFILE.getMsg());
}
}
tab.setTableSpec(ts);
qb.getParseInfo().addTableSpec(alias, ts);
}
ReadEntity parentViewInfo = PlanUtils.getParentViewInfo(getAliasId(alias, qb), viewAliasToInput);
// Temporary tables created during the execution are not the input sources
if (!PlanUtils.isValuesTempTable(alias)) {
PlanUtils.addInput(inputs, new ReadEntity(tab, parentViewInfo, parentViewInfo == null), mergeIsDirect);
}
}
LOG.info("Get metadata for subqueries");
// Go over the subqueries and getMetaData for these
for (String alias : qb.getSubqAliases()) {
boolean wasView = aliasToViewInfo.containsKey(alias);
boolean wasCTE = sqAliasToCTEName.containsKey(alias);
ReadEntity newParentInput = null;
if (wasView) {
viewsExpanded.add(aliasToViewInfo.get(alias).getFirst());
newParentInput = aliasToViewInfo.get(alias).getSecond();
} else if (wasCTE) {
ctesExpanded.add(sqAliasToCTEName.get(alias));
}
QBExpr qbexpr = qb.getSubqForAlias(alias);
getMetaData(qbexpr, newParentInput);
if (wasView) {
viewsExpanded.remove(viewsExpanded.size() - 1);
} else if (wasCTE) {
ctesExpanded.remove(ctesExpanded.size() - 1);
}
}
RowFormatParams rowFormatParams = new RowFormatParams();
StorageFormat storageFormat = new StorageFormat(conf);
LOG.info("Get metadata for destination tables");
// Go over all the destination structures and populate the related
// metadata
QBParseInfo qbp = qb.getParseInfo();
for (String name : qbp.getClauseNamesForDest()) {
ASTNode ast = qbp.getDestForClause(name);
switch(ast.getToken().getType()) {
case HiveParser.TOK_TAB:
{
TableSpec ts = new TableSpec(db, conf, ast);
if (ts.tableHandle.isView() || ts.tableHandle.isMaterializedView()) {
throw new SemanticException(ErrorMsg.DML_AGAINST_VIEW.getMsg());
}
Class<?> outputFormatClass = ts.tableHandle.getOutputFormatClass();
if (!ts.tableHandle.isNonNative() && !HiveOutputFormat.class.isAssignableFrom(outputFormatClass)) {
throw new SemanticException(ErrorMsg.INVALID_OUTPUT_FORMAT_TYPE.getMsg(ast, "The class is " + outputFormatClass.toString()));
}
boolean isTableWrittenTo = qb.getParseInfo().isInsertIntoTable(ts.tableHandle.getDbName(), ts.tableHandle.getTableName());
isTableWrittenTo |= (qb.getParseInfo().getInsertOverwriteTables().get(getUnescapedName((ASTNode) ast.getChild(0), ts.tableHandle.getDbName())) != null);
assert isTableWrittenTo : "Inconsistent data structure detected: we are writing to " + ts.tableHandle + " in " + name + " but it's not in isInsertIntoTable() or getInsertOverwriteTables()";
// Disallow update and delete on non-acid tables
boolean isAcid = AcidUtils.isAcidTable(ts.tableHandle);
if ((updating(name) || deleting(name)) && !isAcid) {
// here, it means the table itself doesn't support it.
throw new SemanticException(ErrorMsg.ACID_OP_ON_NONACID_TABLE, ts.tableName);
}
// but whether the table itself is partitioned is not known.
if (ts.specType != SpecType.STATIC_PARTITION) {
// This is a table or dynamic partition
qb.getMetaData().setDestForAlias(name, ts.tableHandle);
// has dynamic as well as static partitions
if (ts.partSpec != null && ts.partSpec.size() > 0) {
qb.getMetaData().setPartSpecForAlias(name, ts.partSpec);
}
} else {
// This is a partition
qb.getMetaData().setDestForAlias(name, ts.partHandle);
}
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
// Add the table spec for the destination table.
qb.getParseInfo().addTableSpec(ts.tableName.toLowerCase(), ts);
}
break;
}
case HiveParser.TOK_DIR:
{
// This is a dfs file
String fname = stripQuotes(ast.getChild(0).getText());
if ((!qb.getParseInfo().getIsSubQ()) && (((ASTNode) ast.getChild(0)).getToken().getType() == HiveParser.TOK_TMP_FILE)) {
if (qb.isCTAS() || qb.isMaterializedView()) {
qb.setIsQuery(false);
ctx.setResDir(null);
ctx.setResFile(null);
Path location;
// If the CTAS query does specify a location, use the table location, else use the db location
if (qb.getTableDesc() != null && qb.getTableDesc().getLocation() != null) {
location = new Path(qb.getTableDesc().getLocation());
} else {
// allocate a temporary output dir on the location of the table
String tableName = getUnescapedName((ASTNode) ast.getChild(0));
String[] names = Utilities.getDbTableName(tableName);
try {
Warehouse wh = new Warehouse(conf);
//Use destination table's db location.
String destTableDb = qb.getTableDesc() != null ? qb.getTableDesc().getDatabaseName() : null;
if (destTableDb == null) {
destTableDb = names[0];
}
location = wh.getDatabasePath(db.getDatabase(destTableDb));
} catch (MetaException e) {
throw new SemanticException(e);
}
}
try {
fname = ctx.getExtTmpPathRelTo(FileUtils.makeQualified(location, conf)).toString();
} catch (Exception e) {
throw new SemanticException(generateErrorMessage(ast, "Error creating temporary folder on: " + location.toString()), e);
}
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
TableSpec ts = new TableSpec(db, conf, this.ast);
// Add the table spec for the destination table.
qb.getParseInfo().addTableSpec(ts.tableName.toLowerCase(), ts);
}
} else {
// This is the only place where isQuery is set to true; it defaults to false.
qb.setIsQuery(true);
Path stagingPath = getStagingDirectoryPathname(qb);
fname = stagingPath.toString();
ctx.setResDir(stagingPath);
}
}
boolean isDfsFile = true;
if (ast.getChildCount() >= 2 && ast.getChild(1).getText().toLowerCase().equals("local")) {
isDfsFile = false;
}
// Set the destination for the SELECT query inside the CTAS
qb.getMetaData().setDestForAlias(name, fname, isDfsFile);
CreateTableDesc directoryDesc = new CreateTableDesc();
boolean directoryDescIsSet = false;
int numCh = ast.getChildCount();
for (int num = 1; num < numCh; num++) {
ASTNode child = (ASTNode) ast.getChild(num);
if (child != null) {
if (storageFormat.fillStorageFormat(child)) {
directoryDesc.setOutputFormat(storageFormat.getOutputFormat());
directoryDesc.setSerName(storageFormat.getSerde());
directoryDescIsSet = true;
continue;
}
switch(child.getToken().getType()) {
case HiveParser.TOK_TABLEROWFORMAT:
rowFormatParams.analyzeRowFormat(child);
directoryDesc.setFieldDelim(rowFormatParams.fieldDelim);
directoryDesc.setLineDelim(rowFormatParams.lineDelim);
directoryDesc.setCollItemDelim(rowFormatParams.collItemDelim);
directoryDesc.setMapKeyDelim(rowFormatParams.mapKeyDelim);
directoryDesc.setFieldEscape(rowFormatParams.fieldEscape);
directoryDesc.setNullFormat(rowFormatParams.nullFormat);
directoryDescIsSet = true;
break;
case HiveParser.TOK_TABLESERIALIZER:
ASTNode serdeChild = (ASTNode) child.getChild(0);
storageFormat.setSerde(unescapeSQLString(serdeChild.getChild(0).getText()));
directoryDesc.setSerName(storageFormat.getSerde());
if (serdeChild.getChildCount() > 1) {
directoryDesc.setSerdeProps(new HashMap<String, String>());
readProps((ASTNode) serdeChild.getChild(1).getChild(0), directoryDesc.getSerdeProps());
}
directoryDescIsSet = true;
break;
}
}
}
if (directoryDescIsSet) {
qb.setDirectoryDesc(directoryDesc);
}
break;
}
default:
throw new SemanticException(generateErrorMessage(ast, "Unknown Token Type " + ast.getToken().getType()));
}
}
}
Use of org.apache.hadoop.mapred.InputFormat in project SQLWindowing by hbutani.
The class IOUtils, method createTableWindowingInput.
@SuppressWarnings("unchecked")
public static WindowingInput createTableWindowingInput(String dbName, String tableName, Configuration conf) throws WindowingException {
try {
HiveMetaStoreClient client = HiveUtils.getClient(conf);
String db = HiveUtils.validateDB(client, dbName);
Table t = HiveUtils.getTable(client, db, tableName);
StorageDescriptor sd = t.getSd();
HiveConf hConf = new HiveConf(conf, IOUtils.class);
JobConf job = new JobConf(hConf);
Class<? extends InputFormat<? extends Writable, ? extends Writable>> inputFormatClass = (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(sd.getInputFormat());
hConf.setClass("mapred.input.format.class", inputFormatClass, InputFormat.class);
hConf.set(INPUT_INPUTFORMAT_CLASS, inputFormatClass.getName());
InputFormat<? extends Writable, ? extends Writable> iFmt = inputFormatClass.newInstance();
if (iFmt instanceof TextInputFormat) {
((TextInputFormat) iFmt).configure(job);
}
Path p = new Path(sd.getLocation());
/*
* Convert the Path in the StorageDescriptor into a Path in the current FileSystem.
* Used in testing: Jobs run on MiniDFSCluster, whereas hive metadata refers to a real cluster.
*/
{
p = makeQualified(p, conf);
}
FileInputFormat.addInputPath(job, p);
InputSplit[] iSplits = iFmt.getSplits(job, 1);
org.apache.hadoop.mapred.RecordReader<Writable, Writable> rdr = (org.apache.hadoop.mapred.RecordReader<Writable, Writable>) iFmt.getRecordReader(iSplits[0], job, Reporter.NULL);
hConf.set(INPUT_PATH, sd.getLocation());
hConf.set(INPUT_KEY_CLASS, rdr.createKey().getClass().getName());
hConf.set(INPUT_VALUE_CLASS, rdr.createValue().getClass().getName());
hConf.set(INPUT_SERDE_CLASS, sd.getSerdeInfo().getSerializationLib());
TableWindowingInput tIn = new TableWindowingInput();
tIn.initialize(null, hConf, MetaStoreUtils.getSchema(t));
return tIn;
} catch (WindowingException w) {
throw w;
} catch (Exception e) {
throw new WindowingException(e);
}
}
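A caller might use this factory roughly as follows; a minimal sketch in which the database and table names are placeholders and the returned input is assumed to be iterated elsewhere.
// Hypothetical usage sketch: build a WindowingInput for a metastore table.
static WindowingInput openTable(Configuration conf) throws WindowingException {
  // "mydb" and "sales" are placeholder names; the method above resolves them against the
  // metastore and configures the input format, SerDe, and key/value classes on the
  // returned TableWindowingInput.
  return IOUtils.createTableWindowingInput("mydb", "sales", conf);
}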