Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class MapredCarbonInputFormat, method getSplits:
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
  jobConf.set(DATABASE_NAME, "_dummyDb_" + UUID.randomUUID().toString());
  jobConf.set(TABLE_NAME, "_dummyTable_" + UUID.randomUUID().toString());
  org.apache.hadoop.mapreduce.JobContext jobContext = Job.getInstance(jobConf);
  CarbonTable carbonTable;
  try {
    carbonTable = getCarbonTable(jobContext.getConfiguration(),
        jobContext.getConfiguration().get(hive_metastoreConstants.META_TABLE_LOCATION));
  } catch (FileNotFoundException e) {
    return new InputSplit[0];
  } catch (Exception e) {
    throw new IOException("Unable to read Carbon Schema: ", e);
  }
  List<String> partitionNames = new ArrayList<>();
  if (carbonTable.isHivePartitionTable()) {
    // For a Hive partitioned table, prune to the single partition pointed at by the input dir.
    String partitionPath = FileFactory.getCarbonFile(
        jobContext.getConfiguration().get(FileInputFormat.INPUT_DIR)).getAbsolutePath();
    partitionNames.add(partitionPath.substring(carbonTable.getTablePath().length()));
    List<PartitionSpec> partitionSpec = new ArrayList<>();
    partitionSpec.add(new PartitionSpec(partitionNames, partitionPath));
    setPartitionsToPrune(jobContext.getConfiguration(), partitionSpec);
  }
  try {
    setFilterPredicates(jobContext.getConfiguration(), carbonTable);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  // Transactional tables go through the table-level input format; plain carbon files
  // are read through the file-level input format.
  CarbonInputFormat<Void> carbonInputFormat;
  if (carbonTable.isTransactionalTable()) {
    carbonInputFormat = new CarbonTableInputFormat<>();
    jobContext.getConfiguration().set(CARBON_TRANSACTIONAL_TABLE, "true");
  } else {
    carbonInputFormat = new CarbonFileInputFormat<>();
  }
  List<org.apache.hadoop.mapreduce.InputSplit> splitList;
  try {
    splitList = carbonInputFormat.getSplits(jobContext);
  } catch (IOException ex) {
    LOGGER.error("Unable to get splits: ", ex);
    // An empty table (no index or data files yet) is not an error; return no splits.
    if (ex.getMessage().contains("No Index files are present in the table location :")
        || ex.getMessage().contains("CarbonData file is not present in the table location")) {
      splitList = new ArrayList<>();
    } else {
      throw ex;
    }
  }
  // Wrap each mapreduce-level CarbonInputSplit into a mapred-compatible CarbonHiveInputSplit.
  InputSplit[] splits = new InputSplit[splitList.size()];
  CarbonInputSplit split;
  for (int i = 0; i < splitList.size(); i++) {
    split = (CarbonInputSplit) splitList.get(i);
    CarbonHiveInputSplit inputSplit = new CarbonHiveInputSplit(split.getSegmentId(),
        split.getPath(), split.getStart(), split.getLength(), split.getLocations(),
        split.getNumberOfBlocklets(), split.getVersion(), split.getBlockStorageIdMap(),
        split.getDetailInfo());
    splits[i] = inputSplit;
  }
  return splits;
}
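For context, here is a minimal caller-side sketch (not from the project sources) of how this mapred-level getSplits might be driven. The warehouse path and the org.apache.carbondata.hive package for MapredCarbonInputFormat are assumptions; only the metastore location and the input directory need to be set, since the database and table names are filled with dummies inside the method itself.

import org.apache.carbondata.hive.MapredCarbonInputFormat;  // assumed package
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

public class GetSplitsSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical location of a CarbonData table on HDFS
    String tableLocation = "hdfs:///warehouse/carbon_db/sales";
    JobConf jobConf = new JobConf();
    jobConf.set(hive_metastoreConstants.META_TABLE_LOCATION, tableLocation);
    // Also populates mapreduce.input.fileinputformat.inputdir, which the partition branch reads
    FileInputFormat.setInputPaths(jobConf, new Path(tableLocation));
    InputSplit[] splits = new MapredCarbonInputFormat().getSplits(jobConf, 1);
    System.out.println("Number of Hive splits: " + splits.length);
  }
}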
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonLocalMultiBlockSplit, method convertSplit:
public static CarbonMultiBlockSplit convertSplit(String multiSplitJson) {
  Gson gson = new Gson();
  CarbonLocalMultiBlockSplit carbonLocalMultiBlockSplit =
      gson.fromJson(multiSplitJson, CarbonLocalMultiBlockSplit.class);
  List<CarbonInputSplit> carbonInputSplitList =
      carbonLocalMultiBlockSplit.getSplitList().stream()
          .map(CarbonLocalInputSplit::convertSplit)
          .collect(Collectors.toList());
  CarbonMultiBlockSplit carbonMultiBlockSplit =
      new CarbonMultiBlockSplit(carbonInputSplitList, carbonLocalMultiBlockSplit.getLocations());
  carbonMultiBlockSplit.setFileFormat(carbonLocalMultiBlockSplit.getFileFormat());
  return carbonMultiBlockSplit;
}
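The JSON accepted here is simply the Gson serialization of a CarbonLocalMultiBlockSplit, so a split can be planned in one process and rebuilt in another. A minimal round-trip sketch, assuming localMultiBlockSplit is an already-constructed CarbonLocalMultiBlockSplit:

Gson gson = new Gson();
// Serialize on the side that planned the split (localMultiBlockSplit is assumed to exist)
String multiSplitJson = gson.toJson(localMultiBlockSplit);
// Rebuild the Hadoop-level split on the side that will actually read the data
CarbonMultiBlockSplit multiBlockSplit = CarbonLocalMultiBlockSplit.convertSplit(multiSplitJson);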
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonLocalInputSplit, method convertSplit:
public static CarbonInputSplit convertSplit(CarbonLocalInputSplit carbonLocalInputSplit) {
  CarbonInputSplit inputSplit = new CarbonInputSplit(carbonLocalInputSplit.getSegmentId(),
      carbonLocalInputSplit.getBlockletId(), carbonLocalInputSplit.getPath(),
      carbonLocalInputSplit.getStart(), carbonLocalInputSplit.getLength(),
      carbonLocalInputSplit.getLocations()
          .toArray(new String[carbonLocalInputSplit.getLocations().size()]),
      carbonLocalInputSplit.getNumberOfBlocklets(),
      ColumnarFormatVersion.valueOf(carbonLocalInputSplit.getVersion()),
      carbonLocalInputSplit.getDeleteDeltaFiles());
  inputSplit.setFormat(carbonLocalInputSplit.getFileFormat());
  if (FileFormat.COLUMNAR_V3.ordinal() == inputSplit.getFileFormat().ordinal()
      && null != carbonLocalInputSplit.detailInfo
      && !carbonLocalInputSplit.detailInfo.equalsIgnoreCase("null")) {
    GsonBuilder gsonBuilder = new GsonBuilder();
    // add typeAdapter for DataType class for deserialization
    gsonBuilder.registerTypeAdapter(DataType.class, new DataTypeDeserializer());
    Gson gson = gsonBuilder.create();
    BlockletDetailInfo blockletDetailInfo =
        gson.fromJson(carbonLocalInputSplit.detailInfo, BlockletDetailInfo.class);
    if (null == blockletDetailInfo) {
      throw new RuntimeException("Could not read blocklet details");
    }
    try {
      blockletDetailInfo.readColumnSchema(blockletDetailInfo.getColumnSchemaBinary());
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    inputSplit.setDetailInfo(blockletDetailInfo);
  }
  return inputSplit;
}
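A short usage sketch (assumed, not from the project sources): converting one deserialized local split and checking whether the COLUMNAR_V3 branch above attached blocklet-level detail. The variable localSplit is hypothetical.

// localSplit: a CarbonLocalInputSplit deserialized from the planner's JSON (assumed to exist)
CarbonInputSplit carbonSplit = CarbonLocalInputSplit.convertSplit(localSplit);
// Detail info is only attached for COLUMNAR_V3 splits that carried a non-null detailInfo string
System.out.println("Blocklet detail attached: " + (carbonSplit.getDetailInfo() != null));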
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonReaderBuilder, method prepareFileInputFormat:
private CarbonFileInputFormat prepareFileInputFormat(Job job, boolean enableBlockletDistribution,
    boolean disableLoadBlockIndex) throws IOException {
  if (inputSplit != null && inputSplit instanceof CarbonInputSplit) {
    tablePath = ((CarbonInputSplit) inputSplit).getSegment().getReadCommittedScope().getFilePath();
    tableName = "UnknownTable" + UUID.randomUUID();
  }
  if (null == this.fileLists && null == tablePath) {
    throw new IllegalArgumentException("Please set table path first.");
  }
  // infer schema
  CarbonTable table;
  if (null != this.fileLists) {
    if (fileLists.size() < 1) {
      throw new IllegalArgumentException("fileLists must have at least one file in the list!");
    }
    // Use the longest common path prefix of all listed files as the table path.
    String commonString = String.valueOf(fileLists.get(0));
    for (int i = 1; i < fileLists.size(); i++) {
      commonString = commonString.substring(0,
          StringUtils.indexOfDifference(commonString, String.valueOf(fileLists.get(i))));
    }
    int index = commonString.lastIndexOf("/");
    commonString = commonString.substring(0, index);
    table = CarbonTable.buildTable(commonString, tableName, hadoopConf);
  } else {
    table = CarbonTable.buildTable(tablePath, tableName, hadoopConf);
  }
  if (enableBlockletDistribution) {
    // set cache level to blocklet level
    Map<String, String> tableProperties =
        table.getTableInfo().getFactTable().getTableProperties();
    tableProperties.put(CarbonCommonConstants.CACHE_LEVEL, "BLOCKLET");
    table.getTableInfo().getFactTable().setTableProperties(tableProperties);
  }
  final CarbonFileInputFormat format = new CarbonFileInputFormat();
  format.setTableInfo(job.getConfiguration(), table.getTableInfo());
  format.setTablePath(job.getConfiguration(), table.getTablePath());
  format.setTableName(job.getConfiguration(), table.getTableName());
  format.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
  if (filterExpression != null) {
    format.setFilterPredicates(job.getConfiguration(),
        new IndexFilter(table, filterExpression, true));
  }
  if (null != this.fileLists) {
    format.setFileLists(this.fileLists);
  }
  if (projectionColumns != null) {
    // set the user projection
    int len = projectionColumns.length;
    for (int i = 0; i < len; i++) {
      if (projectionColumns[i].contains(".")) {
        throw new UnsupportedOperationException(
            "Complex child columns projection NOT supported through CarbonReader");
      }
    }
    format.setColumnProjection(job.getConfiguration(), projectionColumns);
  }
  if ((disableLoadBlockIndex) && (filterExpression == null)) {
    job.getConfiguration().set("filter_blocks", "false");
  }
  return format;
}
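prepareFileInputFormat is an internal step of the SDK reader; from the outside it is exercised through the reader builder. A minimal SDK-level sketch, assuming a CarbonData store at the hypothetical path /tmp/carbon_output containing the columns name and age:

import java.io.IOException;
import org.apache.carbondata.sdk.file.CarbonReader;

public class ReaderSketch {
  public static void main(String[] args) throws IOException, InterruptedException {
    CarbonReader reader = CarbonReader.builder("/tmp/carbon_output", "_temp")
        .projection(new String[]{"name", "age"})
        .build();
    while (reader.hasNext()) {
      Object[] row = (Object[]) reader.readNextRow();
      System.out.println(row[0] + ", " + row[1]);
    }
    reader.close();
  }
}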
Use of org.apache.carbondata.hadoop.CarbonInputSplit in project carbondata by apache.
The class CarbonCompactionUtil, method getTaskCountForSegment:
public static int getTaskCountForSegment(CarbonInputSplit[] splits) {
  Set<String> taskIdSet = new HashSet<>();
  for (CarbonInputSplit split : splits) {
    String taskId = split.taskId;
    taskIdSet.add(taskId);
  }
  return taskIdSet.size();
}
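A hedged usage sketch: given the splits of a single segment (segmentSplits is assumed to come from an earlier getSplits call), the distinct task-id count can be read off directly, presumably to gauge how much parallel work the segment represents during compaction.

// segmentSplits: CarbonInputSplit[] for one segment, obtained elsewhere (assumed)
int taskCount = CarbonCompactionUtil.getTaskCountForSegment(segmentSplits);
System.out.println("Distinct carbondata tasks in segment: " + taskCount);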