Example 1 with HiveReadEntry

Use of org.apache.drill.exec.store.hive.HiveReadEntry in project drill by axbaretto.

From the class HivePartitionDescriptor, method createNewGroupScan:

private GroupScan createNewGroupScan(List<PartitionLocation> newPartitionLocations) throws ExecutionSetupException {
    HiveScan hiveScan = (HiveScan) scanRel.getGroupScan();
    HiveReadEntry origReadEntry = hiveScan.getHiveReadEntry();
    List<HiveTableWrapper.HivePartitionWrapper> oldPartitions = origReadEntry.partitions;
    List<HiveTableWrapper.HivePartitionWrapper> newPartitions = Lists.newLinkedList();
    // Keep only the partitions whose storage location survived pruning.
    for (HiveTableWrapper.HivePartitionWrapper part : oldPartitions) {
        String partitionLocation = part.getPartition().getSd().getLocation();
        for (PartitionLocation newPartitionLocation : newPartitionLocations) {
            if (partitionLocation.equals(newPartitionLocation.getEntirePartitionLocation())) {
                newPartitions.add(part);
            }
        }
    }
    HiveReadEntry newReadEntry = new HiveReadEntry(origReadEntry.table, newPartitions);
    return hiveScan.clone(newReadEntry);
}
Also used: HiveReadEntry(org.apache.drill.exec.store.hive.HiveReadEntry) HiveScan(org.apache.drill.exec.store.hive.HiveScan) HiveTableWrapper(org.apache.drill.exec.store.hive.HiveTableWrapper) PartitionLocation(org.apache.drill.exec.planner.PartitionLocation)
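
The nested loops above make the matching quadratic in the number of partitions. As a point of comparison, here is a minimal set-based sketch of the same matching; this is an illustrative rewrite, not code from either project, and it assumes getEntirePartitionLocation() returns a String as it does in this fork (the Drill types are assumed to be on the classpath):

import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Illustrative rewrite, not project code: the same location matching done in a
// single pass over each list instead of nested loops.
private static void retainMatchingPartitions(
        List<HiveTableWrapper.HivePartitionWrapper> oldPartitions,
        List<PartitionLocation> newPartitionLocations,
        List<HiveTableWrapper.HivePartitionWrapper> newPartitions) {
    Set<String> retainedLocations = new HashSet<>();
    for (PartitionLocation location : newPartitionLocations) {
        retainedLocations.add(location.getEntirePartitionLocation());
    }
    for (HiveTableWrapper.HivePartitionWrapper part : oldPartitions) {
        // Keep the partition only if its storage location survived pruning.
        if (retainedLocations.contains(part.getPartition().getSd().getLocation())) {
            newPartitions.add(part);
        }
    }
}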

Example 2 with HiveReadEntry

Use of org.apache.drill.exec.store.hive.HiveReadEntry in project drill by apache.

From the class HivePartitionDescriptor, method createNewGroupScan:

private GroupScan createNewGroupScan(List<PartitionLocation> newPartitionLocations) throws ExecutionSetupException {
    HiveScan hiveScan = (HiveScan) scanRel.getGroupScan();
    HiveReadEntry origReadEntry = hiveScan.getHiveReadEntry();
    List<HiveTableWrapper.HivePartitionWrapper> oldPartitions = origReadEntry.partitions;
    List<HiveTableWrapper.HivePartitionWrapper> newPartitions = Lists.newLinkedList();
    for (HiveTableWrapper.HivePartitionWrapper part : oldPartitions) {
        // Compare as Paths rather than raw Strings; Path normalizes the location on construction.
        Path partitionLocation = new Path(part.getPartition().getSd().getLocation());
        for (PartitionLocation newPartitionLocation : newPartitionLocations) {
            if (partitionLocation.equals(newPartitionLocation.getEntirePartitionLocation())) {
                newPartitions.add(part);
            }
        }
    }
    HiveReadEntry newReadEntry = new HiveReadEntry(origReadEntry.table, newPartitions);
    return hiveScan.clone(newReadEntry);
}
Also used: Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) HiveReadEntry(org.apache.drill.exec.store.hive.HiveReadEntry) HiveScan(org.apache.drill.exec.store.hive.HiveScan) HiveTableWrapper(org.apache.drill.exec.store.hive.HiveTableWrapper) PartitionLocation(org.apache.drill.exec.planner.PartitionLocation)
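
The only change from Example 1 is that the location is wrapped in org.apache.hadoop.fs.Path before comparison, since getEntirePartitionLocation() returns a Path in this version. Path normalizes the location on construction, which makes the comparison tolerant of cosmetic differences that defeat String.equals. A small standalone demo of that behavior (illustrative only):

import org.apache.hadoop.fs.Path;

public class PathEqualityDemo {
    public static void main(String[] args) {
        String withSlash = "/user/hive/warehouse/t/part=1/";
        String withoutSlash = "/user/hive/warehouse/t/part=1";
        // The raw strings differ...
        System.out.println(withSlash.equals(withoutSlash)); // false
        // ...but Path strips the trailing slash when it normalizes, so the
        // Path-based comparison in Example 2 treats them as the same location.
        System.out.println(new Path(withSlash).equals(new Path(withoutSlash))); // true
    }
}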

Example 3 with HiveReadEntry

Use of org.apache.drill.exec.store.hive.HiveReadEntry in project drill by apache.

From the class ConvertHiveMapRDBJsonScanToDrillMapRDBJsonScan, method createNativeScanRel:

/**
 * Helper method which creates a DrillScanRel with a native Drill MapR-DB JSON scan
 * in place of the Hive scan.
 */
private DrillScanRel createNativeScanRel(DrillScanRel hiveScanRel, PlannerSettings settings) throws IOException {
    RelDataTypeFactory typeFactory = hiveScanRel.getCluster().getTypeFactory();
    HiveScan hiveScan = (HiveScan) hiveScanRel.getGroupScan();
    HiveReadEntry hiveReadEntry = hiveScan.getHiveReadEntry();
    Map<String, String> parameters = hiveReadEntry.getHiveTableWrapper().getParameters();
    JsonScanSpec scanSpec = new JsonScanSpec(parameters.get(MAPRDB_TABLE_NAME), null, null);
    List<SchemaPath> hiveScanCols = hiveScanRel.getColumns().stream()
        .map(colNameSchemaPath -> replaceOverriddenSchemaPath(parameters, colNameSchemaPath))
        .collect(Collectors.toList());
    // Creates TupleMetadata based on Hive's schema (with optional data modes) to be used
    // in the reader for the case when a column type wasn't discovered.
    HiveToRelDataTypeConverter dataTypeConverter = new HiveToRelDataTypeConverter(typeFactory);
    TupleMetadata schema = new TupleSchema();
    hiveReadEntry.getTable().getColumnListsCache().getTableSchemaColumns()
        .forEach(column -> schema.addColumn(HiveUtilities.getColumnMetadata(
            replaceOverriddenColumnId(parameters, column.getName()),
            dataTypeConverter.convertToNullableRelDataType(column))));
    MapRDBFormatPluginConfig formatConfig = new MapRDBFormatPluginConfig();
    formatConfig.readTimestampWithZoneOffset =
        settings.getOptions().getBoolean(ExecConstants.HIVE_READ_MAPRDB_JSON_TIMESTAMP_WITH_TIMEZONE_OFFSET);
    formatConfig.allTextMode = settings.getOptions().getBoolean(ExecConstants.HIVE_MAPRDB_JSON_ALL_TEXT_MODE);
    JsonTableGroupScan nativeMapRDBScan = new JsonTableGroupScan(
        hiveScan.getUserName(),
        hiveScan.getStoragePlugin(),
        // TODO: We should use Hive format plugins here, once it is implemented. DRILL-6621
        (MapRDBFormatPlugin) hiveScan.getStoragePlugin().getFormatPlugin(formatConfig),
        scanSpec,
        hiveScanCols,
        new MapRDBStatistics(),
        FileSystemMetadataProviderManager.getMetadataProviderForSchema(schema));
    List<String> nativeScanColNames = hiveScanRel.getRowType().getFieldList().stream()
        .map(field -> replaceOverriddenColumnId(parameters, field.getName()))
        .collect(Collectors.toList());
    List<RelDataType> nativeScanColTypes = hiveScanRel.getRowType().getFieldList().stream()
        .map(RelDataTypeField::getType)
        .collect(Collectors.toList());
    RelDataType nativeScanRowType = typeFactory.createStructType(nativeScanColTypes, nativeScanColNames);
    return new DrillScanRel(hiveScanRel.getCluster(), hiveScanRel.getTraitSet(), hiveScanRel.getTable(),
        nativeMapRDBScan, nativeScanRowType, hiveScanCols);
}
Also used: JsonScanSpec(org.apache.drill.exec.store.mapr.db.json.JsonScanSpec) RelDataTypeFactory(org.apache.calcite.rel.type.RelDataTypeFactory) DrillScanRel(org.apache.drill.exec.planner.logical.DrillScanRel) HiveUtilities(org.apache.drill.exec.store.hive.HiveUtilities) MapRDBFormatPluginConfig(org.apache.drill.exec.store.mapr.db.MapRDBFormatPluginConfig) HiveToRelDataTypeConverter(org.apache.drill.exec.planner.types.HiveToRelDataTypeConverter) RelOptHelper(org.apache.drill.exec.planner.logical.RelOptHelper) Map(java.util.Map) MapRDBStatistics(org.apache.drill.exec.planner.index.MapRDBStatistics) MapRDBFormatPlugin(org.apache.drill.exec.store.mapr.db.MapRDBFormatPlugin) TupleSchema(org.apache.drill.exec.record.metadata.TupleSchema) RelDataType(org.apache.calcite.rel.type.RelDataType) PrelUtil(org.apache.drill.exec.planner.physical.PrelUtil) StoragePluginOptimizerRule(org.apache.drill.exec.store.StoragePluginOptimizerRule) HiveMapRDBJsonInputFormat(org.apache.hadoop.hive.maprdb.json.input.HiveMapRDBJsonInputFormat) HiveMetadataProvider(org.apache.drill.exec.store.hive.HiveMetadataProvider) SchemaPath(org.apache.drill.common.expression.SchemaPath) IOException(java.io.IOException) FileSystemMetadataProviderManager(org.apache.drill.exec.metastore.store.FileSystemMetadataProviderManager) TupleMetadata(org.apache.drill.exec.record.metadata.TupleMetadata) Collectors(java.util.stream.Collectors) RelOptRuleCall(org.apache.calcite.plan.RelOptRuleCall) HiveScan(org.apache.drill.exec.store.hive.HiveScan) HiveReadEntry(org.apache.drill.exec.store.hive.HiveReadEntry) List(java.util.List) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) RelDataTypeField(org.apache.calcite.rel.type.RelDataTypeField) JsonTableGroupScan(org.apache.drill.exec.store.mapr.db.json.JsonTableGroupScan) ExecConstants(org.apache.drill.exec.ExecConstants) DocumentConstants(org.ojai.DocumentConstants)
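
The replaceOverriddenColumnId and replaceOverriddenSchemaPath helpers are not part of this listing. Judging from the org.ojai.DocumentConstants import, they translate a Hive column that overrides MapR-DB's reserved row-key column back to the real "_id" key. A speculative sketch of that idea follows; the table-parameter key used here is a made-up placeholder, not Drill's actual parameter name:

import java.util.Map;
import org.ojai.DocumentConstants;

// Speculative sketch, not Drill's code. If the table parameters declare that a
// Hive column stands in for the MapR-DB row key, translate that column name to
// the reserved "_id" key; otherwise pass the name through unchanged.
private static String replaceOverriddenColumnId(Map<String, String> parameters, String colName) {
    // "hypothetical.id.column" is a placeholder key, not the real parameter name.
    String overridingColumn = parameters.get("hypothetical.id.column");
    return colName != null && colName.equals(overridingColumn)
            ? DocumentConstants.ID_KEY // "_id"
            : colName;
}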

Example 4 with HiveReadEntry

Use of org.apache.drill.exec.store.hive.HiveReadEntry in project drill by apache.

From the class ConvertHiveMapRDBJsonScanToDrillMapRDBJsonScan, method onMatch:

@Override
public void onMatch(RelOptRuleCall call) {
    try {
        DrillScanRel hiveScanRel = call.rel(0);
        PlannerSettings settings = PrelUtil.getPlannerSettings(call.getPlanner());
        HiveScan hiveScan = (HiveScan) hiveScanRel.getGroupScan();
        HiveReadEntry hiveReadEntry = hiveScan.getHiveReadEntry();
        HiveMetadataProvider hiveMetadataProvider = new HiveMetadataProvider(hiveScan.getUserName(), hiveReadEntry, hiveScan.getHiveConf());
        if (hiveMetadataProvider.getInputSplits(hiveReadEntry).isEmpty()) {
            // table is empty, use original scan
            return;
        }
        if (hiveScan.getHiveReadEntry().getTable().isSetPartitionKeys()) {
            logger.warn("Hive MapR-DB JSON Handler doesn't support table partitioning. Consider recreating table without " + "partitions");
        }
        DrillScanRel nativeScanRel = createNativeScanRel(hiveScanRel, settings);
        call.transformTo(nativeScanRel);
        /*
        The Drill native scan should take precedence over the Hive scan, since it is more efficient.
        Hive does not always produce correct costing (e.g., for external tables Hive does not know the
        number of rows, so it is only approximated). Drill, on the contrary, calculates the number of
        rows exactly, so the Hive scan could be chosen over the Drill native scan because its cost
        appears lower. To ensure the Drill MapR-DB JSON scan is chosen, reduce the Hive scan's
        importance to 0.
        */
        call.getPlanner().setImportance(hiveScanRel, 0.0);
    } catch (Exception e) {
        // TODO: Improve error handling after allowing to throw IOException from StoragePlugin.getFormatPlugin()
        logger.warn("Failed to convert HiveScan to JsonScanSpec. Fallback to HiveMapR-DB connector.", e);
    }
}
Also used: DrillScanRel(org.apache.drill.exec.planner.logical.DrillScanRel) HiveReadEntry(org.apache.drill.exec.store.hive.HiveReadEntry) PlannerSettings(org.apache.drill.exec.planner.physical.PlannerSettings) HiveScan(org.apache.drill.exec.store.hive.HiveScan) HiveMetadataProvider(org.apache.drill.exec.store.hive.HiveMetadataProvider) IOException(java.io.IOException)
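
onMatch only runs when the rule's match condition accepts the scan, and that condition is not shown in this listing. The HiveMapRDBJsonInputFormat import suggests it checks the Hive table's input format. An inferred sketch of such a check, not copied from Drill (it would live inside the same rule class):

// Inferred sketch, not copied from Drill: accept only Hive scans whose
// underlying table uses the MapR-DB JSON input format.
@Override
public boolean matches(RelOptRuleCall call) {
    DrillScanRel scanRel = call.rel(0);
    if (!(scanRel.getGroupScan() instanceof HiveScan)) {
        return false;
    }
    HiveScan hiveScan = (HiveScan) scanRel.getGroupScan();
    String inputFormat = hiveScan.getHiveReadEntry().getTable().getSd().getInputFormat();
    return HiveMapRDBJsonInputFormat.class.getName().equals(inputFormat);
}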

Example 5 with HiveReadEntry

Use of org.apache.drill.exec.store.hive.HiveReadEntry in project drill by axbaretto.

From the class HivePartitionDescriptor, method createPartitionSublists:

@Override
protected void createPartitionSublists() {
    List<PartitionLocation> locations = new LinkedList<>();
    HiveReadEntry origEntry = ((HiveScan) scanRel.getGroupScan()).getHiveReadEntry();
    for (Partition partition : origEntry.getPartitions()) {
        locations.add(new HivePartitionLocation(partition.getValues(), partition.getSd().getLocation()));
    }
    // Split the locations into fixed-size batches for the pruning machinery.
    locationSuperList = Lists.partition(locations, PartitionDescriptor.PARTITION_BATCH_SIZE);
    sublistsCreated = true;
}
Also used: Partition(org.apache.hadoop.hive.metastore.api.Partition) HiveReadEntry(org.apache.drill.exec.store.hive.HiveReadEntry) HiveScan(org.apache.drill.exec.store.hive.HiveScan) PartitionLocation(org.apache.drill.exec.planner.PartitionLocation) LinkedList(java.util.LinkedList)
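
Guava's Lists.partition does not copy the data: it returns consecutive sublist views of at most the requested size, with the final batch possibly shorter. A tiny standalone demo of the batching used above:

import java.util.Arrays;
import java.util.List;
import com.google.common.collect.Lists;

public class PartitionBatchDemo {
    public static void main(String[] args) {
        List<Integer> locations = Arrays.asList(1, 2, 3, 4, 5);
        // With a batch size of 2, five elements yield batches [1, 2], [3, 4], [5].
        List<List<Integer>> batches = Lists.partition(locations, 2);
        System.out.println(batches); // [[1, 2], [3, 4], [5]]
    }
}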

Aggregations

HiveReadEntry (org.apache.drill.exec.store.hive.HiveReadEntry): 8 uses
HiveScan (org.apache.drill.exec.store.hive.HiveScan): 7 uses
PartitionLocation (org.apache.drill.exec.planner.PartitionLocation): 4 uses
IOException (java.io.IOException): 3 uses
SchemaPath (org.apache.drill.common.expression.SchemaPath): 3 uses
DrillScanRel (org.apache.drill.exec.planner.logical.DrillScanRel): 3 uses
PlannerSettings (org.apache.drill.exec.planner.physical.PlannerSettings): 3 uses
HiveMetadataProvider (org.apache.drill.exec.store.hive.HiveMetadataProvider): 3 uses
HiveTableWrapper (org.apache.drill.exec.store.hive.HiveTableWrapper): 3 uses
Partition (org.apache.hadoop.hive.metastore.api.Partition): 3 uses
LinkedList (java.util.LinkedList): 2 uses
Path (org.apache.hadoop.fs.Path): 2 uses
Table (org.apache.hadoop.hive.metastore.api.Table): 2 uses
List (java.util.List): 1 use
Map (java.util.Map): 1 use
Collectors (java.util.stream.Collectors): 1 use
RelOptRuleCall (org.apache.calcite.plan.RelOptRuleCall): 1 use
RelDataType (org.apache.calcite.rel.type.RelDataType): 1 use
RelDataTypeFactory (org.apache.calcite.rel.type.RelDataTypeFactory): 1 use
RelDataTypeField (org.apache.calcite.rel.type.RelDataTypeField): 1 use