Example 11 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class MapredCarbonInputFormat method getSplits.

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
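    // Placeholder database/table names; the real table is resolved below from the metastore location.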
    jobConf.set(DATABASE_NAME, "_dummyDb_" + UUID.randomUUID().toString());
    jobConf.set(TABLE_NAME, "_dummyTable_" + UUID.randomUUID().toString());
    org.apache.hadoop.mapreduce.JobContext jobContext = Job.getInstance(jobConf);
    CarbonTable carbonTable;
    try {
        carbonTable = getCarbonTable(jobContext.getConfiguration(), jobContext.getConfiguration().get(hive_metastoreConstants.META_TABLE_LOCATION));
    } catch (FileNotFoundException e) {
        return new InputSplit[0];
    } catch (Exception e) {
        throw new IOException("Unable read Carbon Schema: ", e);
    }
    List<String> partitionNames = new ArrayList<>();
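    // For Hive partition tables, prune reads to the partition directory Hive supplied as the input dir.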
    if (carbonTable.isHivePartitionTable()) {
        String partitionPath = FileFactory.getCarbonFile(jobContext.getConfiguration().get(FileInputFormat.INPUT_DIR)).getAbsolutePath();
        partitionNames.add(partitionPath.substring(carbonTable.getTablePath().length()));
        List<PartitionSpec> partitionSpec = new ArrayList<>();
        partitionSpec.add(new PartitionSpec(partitionNames, partitionPath));
        setPartitionsToPrune(jobContext.getConfiguration(), partitionSpec);
    }
    try {
        setFilterPredicates(jobContext.getConfiguration(), carbonTable);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    CarbonInputFormat<Void> carbonInputFormat;
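    // Transactional tables go through the segment-aware CarbonTableInputFormat;
    // plain Carbon files are read through CarbonFileInputFormat.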
    if (carbonTable.isTransactionalTable()) {
        carbonInputFormat = new CarbonTableInputFormat<>();
        jobContext.getConfiguration().set(CARBON_TRANSACTIONAL_TABLE, "true");
    } else {
        carbonInputFormat = new CarbonFileInputFormat<>();
    }
    List<org.apache.hadoop.mapreduce.InputSplit> splitList;
    try {
        splitList = carbonInputFormat.getSplits(jobContext);
    } catch (IOException ex) {
        LOGGER.error("Unable to get splits: ", ex);
        if (ex.getMessage().contains("No Index files are present in the table location :") || ex.getMessage().contains("CarbonData file is not present in the table location")) {
            splitList = new ArrayList<>();
        } else {
            throw ex;
        }
    }
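    // Re-wrap each CarbonInputSplit as a mapred-compatible CarbonHiveInputSplit.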
    InputSplit[] splits = new InputSplit[splitList.size()];
    CarbonInputSplit split;
    for (int i = 0; i < splitList.size(); i++) {
        split = (CarbonInputSplit) splitList.get(i);
        CarbonHiveInputSplit inputSplit = new CarbonHiveInputSplit(split.getSegmentId(), split.getPath(), split.getStart(), split.getLength(), split.getLocations(), split.getNumberOfBlocklets(), split.getVersion(), split.getBlockStorageIdMap(), split.getDetailInfo());
        splits[i] = inputSplit;
    }
    return splits;
}
Also used : FileNotFoundException(java.io.FileNotFoundException) ArrayList(java.util.ArrayList) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) IOException(java.io.IOException) PartitionSpec(org.apache.carbondata.core.indexstore.PartitionSpec) SQLException(java.sql.SQLException) InvalidConfigurationException(org.apache.carbondata.core.exception.InvalidConfigurationException) InvalidPathException(org.apache.hadoop.fs.InvalidPathException) CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable)
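
A minimal driver sketch for exercising this method. Assumptions: MapredCarbonInputFormat lives in org.apache.carbondata.hive, and hive_metastoreConstants.META_TABLE_LOCATION resolves to the key "location"; the table path is illustrative.

import org.apache.carbondata.hive.MapredCarbonInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

public final class GetSplitsDemo {
    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf(new Configuration());
        // Equivalent to setting hive_metastoreConstants.META_TABLE_LOCATION.
        jobConf.set("location", "hdfs://namenode/warehouse/carbon_table"); // illustrative path
        // numSplits is part of the mapred contract but is not used by the implementation above.
        InputSplit[] splits = new MapredCarbonInputFormat().getSplits(jobConf, 1);
        System.out.println("splits: " + splits.length);
    }
}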

Example 12 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonLocalMultiBlockSplit method convertSplit.

public static CarbonMultiBlockSplit convertSplit(String multiSplitJson) {
    Gson gson = new Gson();
    CarbonLocalMultiBlockSplit carbonLocalMultiBlockSplit = gson.fromJson(multiSplitJson, CarbonLocalMultiBlockSplit.class);
    List<CarbonInputSplit> carbonInputSplitList = carbonLocalMultiBlockSplit.getSplitList().stream().map(CarbonLocalInputSplit::convertSplit).collect(Collectors.toList());
    CarbonMultiBlockSplit carbonMultiBlockSplit = new CarbonMultiBlockSplit(carbonInputSplitList, carbonLocalMultiBlockSplit.getLocations());
    carbonMultiBlockSplit.setFileFormat(carbonLocalMultiBlockSplit.getFileFormat());
    return carbonMultiBlockSplit;
}
Also used : CarbonMultiBlockSplit(org.apache.carbondata.hadoop.CarbonMultiBlockSplit) Gson(com.google.gson.Gson) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit)
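
The inverse direction is ordinary Gson serialization, so a round trip is short. A minimal sketch, assuming carbonLocalMultiBlockSplit is an already-populated CarbonLocalMultiBlockSplit:

import com.google.gson.Gson;

// Serialize the local descriptor to JSON, then rebuild the engine-facing split.
String json = new Gson().toJson(carbonLocalMultiBlockSplit);
CarbonMultiBlockSplit restored = CarbonLocalMultiBlockSplit.convertSplit(json);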

Example 13 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonLocalInputSplit method convertSplit.

public static CarbonInputSplit convertSplit(CarbonLocalInputSplit carbonLocalInputSplit) {
    CarbonInputSplit inputSplit = new CarbonInputSplit(carbonLocalInputSplit.getSegmentId(), carbonLocalInputSplit.getBlockletId(), carbonLocalInputSplit.getPath(), carbonLocalInputSplit.getStart(), carbonLocalInputSplit.getLength(), carbonLocalInputSplit.getLocations().toArray(new String[carbonLocalInputSplit.getLocations().size()]), carbonLocalInputSplit.getNumberOfBlocklets(), ColumnarFormatVersion.valueOf(carbonLocalInputSplit.getVersion()), carbonLocalInputSplit.getDeleteDeltaFiles());
    inputSplit.setFormat(carbonLocalInputSplit.getFileFormat());
    if (FileFormat.COLUMNAR_V3.ordinal() == inputSplit.getFileFormat().ordinal() && null != carbonLocalInputSplit.detailInfo && !carbonLocalInputSplit.detailInfo.equalsIgnoreCase("null")) {
        GsonBuilder gsonBuilder = new GsonBuilder();
        // add typeAdapter for DataType Class for deserialization
        gsonBuilder.registerTypeAdapter(DataType.class, new DataTypeDeserializer());
        Gson gson = gsonBuilder.create();
        BlockletDetailInfo blockletDetailInfo = gson.fromJson(carbonLocalInputSplit.detailInfo, BlockletDetailInfo.class);
        if (null == blockletDetailInfo) {
            throw new RuntimeException("Could not read blocklet details");
        }
        try {
            blockletDetailInfo.readColumnSchema(blockletDetailInfo.getColumnSchemaBinary());
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        inputSplit.setDetailInfo(blockletDetailInfo);
    }
    return inputSplit;
}
Also used : DataTypeDeserializer(org.apache.carbondata.core.metadata.datatype.DataTypeDeserializer) GsonBuilder(com.google.gson.GsonBuilder) BlockletDetailInfo(org.apache.carbondata.core.indexstore.BlockletDetailInfo) Gson(com.google.gson.Gson) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) IOException(java.io.IOException)
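
In practice this converter is mapped over a whole list of serialized splits, exactly as Example 12 does. A minimal sketch, assuming localSplits is a List<CarbonLocalInputSplit>:

import java.util.List;
import java.util.stream.Collectors;

List<CarbonInputSplit> splits = localSplits.stream()
        .map(CarbonLocalInputSplit::convertSplit)
        .collect(Collectors.toList());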

Example 14 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonReaderBuilder method prepareFileInputFormat.

private CarbonFileInputFormat prepareFileInputFormat(Job job, boolean enableBlockletDistribution, boolean disableLoadBlockIndex) throws IOException {
    if (inputSplit != null && inputSplit instanceof CarbonInputSplit) {
        tablePath = ((CarbonInputSplit) inputSplit).getSegment().getReadCommittedScope().getFilePath();
        tableName = "UnknownTable" + UUID.randomUUID();
    }
    if (null == this.fileLists && null == tablePath) {
        throw new IllegalArgumentException("Please set table path first.");
    }
    // infer schema
    CarbonTable table;
    if (null != this.fileLists) {
        if (fileLists.size() < 1) {
            throw new IllegalArgumentException("fileLists must have one file in list as least!");
        }
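        // Infer the table path as the longest common directory prefix of all file paths.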
        String commonString = String.valueOf(fileLists.get(0));
        for (int i = 1; i < fileLists.size(); i++) {
            commonString = commonString.substring(0, StringUtils.indexOfDifference(commonString, String.valueOf(fileLists.get(i))));
        }
        int index = commonString.lastIndexOf("/");
        commonString = commonString.substring(0, index);
        table = CarbonTable.buildTable(commonString, tableName, hadoopConf);
    } else {
        table = CarbonTable.buildTable(tablePath, tableName, hadoopConf);
    }
    if (enableBlockletDistribution) {
        // set cache level to blocklet level
        Map<String, String> tableProperties = table.getTableInfo().getFactTable().getTableProperties();
        tableProperties.put(CarbonCommonConstants.CACHE_LEVEL, "BLOCKLET");
        table.getTableInfo().getFactTable().setTableProperties(tableProperties);
    }
    final CarbonFileInputFormat format = new CarbonFileInputFormat();
    format.setTableInfo(job.getConfiguration(), table.getTableInfo());
    format.setTablePath(job.getConfiguration(), table.getTablePath());
    format.setTableName(job.getConfiguration(), table.getTableName());
    format.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
    if (filterExpression != null) {
        format.setFilterPredicates(job.getConfiguration(), new IndexFilter(table, filterExpression, true));
    }
    if (null != this.fileLists) {
        format.setFileLists(this.fileLists);
    }
    if (projectionColumns != null) {
        // set the user projection
        int len = projectionColumns.length;
        for (int i = 0; i < len; i++) {
            if (projectionColumns[i].contains(".")) {
                throw new UnsupportedOperationException("Complex child columns projection NOT supported through CarbonReader");
            }
        }
        format.setColumnProjection(job.getConfiguration(), projectionColumns);
    }
    if ((disableLoadBlockIndex) && (filterExpression == null)) {
        job.getConfiguration().set("filter_blocks", "false");
    }
    return format;
}
Also used : CarbonTable(org.apache.carbondata.core.metadata.schema.table.CarbonTable) CarbonFileInputFormat(org.apache.carbondata.hadoop.api.CarbonFileInputFormat) CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) IndexFilter(org.apache.carbondata.core.index.IndexFilter)
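
The common-prefix inference above relies on StringUtils.indexOfDifference (assumed here to be Commons Lang 3), which returns the index of the first character at which the two strings differ. A worked sketch with illustrative paths:

import org.apache.commons.lang3.StringUtils;

String a = "/data/tbl/part-0001.carbondata";
String b = "/data/tbl/part-0002.carbondata";
String common = a.substring(0, StringUtils.indexOfDifference(a, b)); // "/data/tbl/part-000"
String tableDir = common.substring(0, common.lastIndexOf('/'));      // "/data/tbl"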

Example 15 with CarbonInputSplit

use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.

the class CarbonCompactionUtil method getTaskCountForSegment.

public static int getTaskCountForSegment(CarbonInputSplit[] splits) {
    Set<String> taskIdSet = new HashSet<>();
    for (CarbonInputSplit split : splits) {
        String taskId = split.taskId;
        taskIdSet.add(taskId);
    }
    return taskIdSet.size();
}
Also used : CarbonInputSplit(org.apache.carbondata.hadoop.CarbonInputSplit) HashSet(java.util.HashSet)
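
Because taskId is a public field on CarbonInputSplit, the same count can be written as a stream pipeline; a behavior-equivalent sketch:

import java.util.Arrays;

long taskCount = Arrays.stream(splits)
        .map(split -> split.taskId)
        .distinct()
        .count();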

Aggregations

CarbonInputSplit (org.apache.carbondata.hadoop.CarbonInputSplit): 33
ArrayList (java.util.ArrayList): 17
IOException (java.io.IOException): 15
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 10
CarbonTable (org.apache.carbondata.core.metadata.schema.table.CarbonTable): 8
LinkedList (java.util.LinkedList): 6
CarbonMultiBlockSplit (org.apache.carbondata.hadoop.CarbonMultiBlockSplit): 6
IndexFilter (org.apache.carbondata.core.index.IndexFilter): 5
CarbonTablePath (org.apache.carbondata.core.util.path.CarbonTablePath): 5
HashMap (java.util.HashMap): 4
HashSet (java.util.HashSet): 4
List (java.util.List): 4
TableBlockInfo (org.apache.carbondata.core.datastore.block.TableBlockInfo): 4
PartitionSpec (org.apache.carbondata.core.indexstore.PartitionSpec): 4
LoadMetadataDetails (org.apache.carbondata.core.statusmanager.LoadMetadataDetails): 4
SegmentUpdateStatusManager (org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager): 4
CarbonTableInputFormat (org.apache.carbondata.hadoop.api.CarbonTableInputFormat): 4
Configuration (org.apache.hadoop.conf.Configuration): 4
Path (org.apache.hadoop.fs.Path): 4
Gson (com.google.gson.Gson): 3